diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java index 87611ad..dd7690e 100644 --- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java @@ -60,8 +60,7 @@ public void initialize(Configuration conf, Properties tbl) throws SerDeException { - jsonSerde.initialize(conf, tbl); - jsonSerde.setWriteablesUsage(false); + jsonSerde.initialize(conf, tbl, false); StructTypeInfo rowTypeInfo = jsonSerde.getTypeInfo(); cachedObjectInspector = HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo); @@ -84,28 +83,32 @@ public void initialize(Configuration conf, Properties tbl) @Override public Object deserialize(Writable blob) throws SerDeException { try { - Object row = jsonSerde.deserialize(blob); - List fatRow = fatLand((Object[]) row); + List row = (List) jsonSerde.deserialize(blob); + List fatRow = fatLand(row); return new DefaultHCatRecord(fatRow); } catch (Exception e) { throw new SerDeException(e); } } + @SuppressWarnings({"rawtypes", "unchecked" }) - private static List fatLand(Object[] arr) { - List ret = new ArrayList<>(); - for (Object o : arr) { - if (o != null && o instanceof Map) { + private static List fatLand(final List arr) { + final List ret = new ArrayList<>(); + for (final Object o : arr) { + if (o == null) { + ret.add(null); + } else if (o instanceof Map) { ret.add(fatMap(((Map) o))); - } else if (o != null && o instanceof List) { - ret.add(fatLand(((List) o).toArray())); - } else if (o != null && o.getClass().isArray() && o.getClass().getComponentType() != byte.class) { + } else if (o instanceof List) { + ret.add(fatLand((List) o)); + } else if (o.getClass().isArray() + && o.getClass().getComponentType() != byte.class) { Class ct = o.getClass().getComponentType(); if (ct.isPrimitive()) { ret.add(primitiveArrayToList(o)); } else { - ret.add(fatLand((Object[]) o)); + ret.add(fatLand(Arrays.asList((Object[]) o))); } } else { ret.add(o); @@ -114,15 +117,14 @@ private static List fatLand(Object[] arr) { return ret; } - @SuppressWarnings("rawtypes") private static Object fatMap(Map map) { - Map ret = new LinkedHashMap<>(); + Map ret = new LinkedHashMap<>(); Set> es = map.entrySet(); for (Entry e : es) { - Object oldV = e.getValue(); - Object newV; + final Object oldV = e.getValue(); + final Object newV; if (oldV != null && oldV.getClass().isArray()) { - newV = fatLand((Object[]) oldV); + newV = fatLand(Arrays.asList((Object[]) oldV)); } else { newV = oldV; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java index f5814ed..7064c80 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java @@ -21,7 +21,8 @@ import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.json.HiveJsonStructReader; +import org.apache.hadoop.hive.serde2.json.HiveJsonReader; +import org.apache.hadoop.hive.serde2.json.HiveJsonReader.Feature; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import 
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; @@ -39,7 +40,7 @@ public class GenericUDFJsonRead extends GenericUDF { private TextConverter inputConverter; - private HiveJsonStructReader jsonReader; + private HiveJsonReader jsonReader; @Override public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { @@ -55,9 +56,11 @@ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumen String typeStr = getConstantStringValue(arguments, 1); try { - TypeInfo t = TypeInfoUtils.getTypeInfoFromTypeString(typeStr); - jsonReader = new HiveJsonStructReader(t); - jsonReader.setWritablesUsage(true); + final TypeInfo t = TypeInfoUtils.getTypeInfoFromTypeString(typeStr); + final ObjectInspector oi = + TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(t); + jsonReader = new HiveJsonReader(oi); + jsonReader.enable(Feature.PRIMITIVE_TO_WRITABLE); } catch (Exception e) { throw new UDFArgumentException(getFuncName() + ": Error parsing typestring: " + e.getMessage()); } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java index 3016eaf..46a8864 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java @@ -96,9 +96,9 @@ public void testSimpleStruct() throws Exception { udf.initialize(arguments); Object res = udf.evaluate(evalArgs("{\"a\":\"b\"}")); - assertTrue(res instanceof Object[]); - Object o[] = (Object[]) res; - assertEquals(new Text("b"), o[0]); + assertTrue(res instanceof List); + List o = (List) res; + assertEquals(new Text("b"), o.get(0)); } } @@ -109,9 +109,10 @@ public void testStructNullField() throws Exception { udf.initialize(arguments); Object res = udf.evaluate(evalArgs("{\"a\":null}")); - assertTrue(res instanceof Object[]); - Object o[] = (Object[]) res; - assertEquals(null, o[0]); + assertTrue(res instanceof List); + + List o = (List) res; + assertEquals(null, o.get(0)); } } @@ -144,9 +145,10 @@ public void testStructNullComplexField() throws Exception { udf.initialize(arguments); Object res = udf.evaluate(evalArgs("{\"a\":null}")); - assertTrue(res instanceof Object[]); - Object o[] = (Object[]) res; - assertEquals(null, o[0]); + assertTrue(res instanceof List); + + List o = (List) res; + assertEquals(null, o.get(0)); } } @@ -156,10 +158,8 @@ public void testUndeclaredStructField() throws Exception { ObjectInspector[] arguments = buildArguments("struct"); udf.initialize(arguments); - Object res = udf.evaluate(evalArgs("{\"b\":null}")); - assertTrue(res instanceof Object[]); - Object o[] = (Object[]) res; - assertEquals(null, o[0]); + // Invalid - should throw Exception + udf.evaluate(evalArgs("{\"b\":null}")); } } @@ -169,10 +169,8 @@ public void testUnexpectedStruct() throws Exception { ObjectInspector[] arguments = buildArguments("array"); udf.initialize(arguments); - Object res = udf.evaluate(evalArgs("[1,22,2,{\"b\":null}]")); - assertTrue(res instanceof Object[]); - Object o[] = (Object[]) res; - assertEquals(null, o[0]); + // Invalid - should throw Exception + udf.evaluate(evalArgs("[1,22,2,{\"b\":null}]")); } } @@ -184,7 +182,7 @@ public void testMap() throws Exception { Object res = udf.evaluate(evalArgs("{\"a\":\"v\"}")); assertTrue(res instanceof Map); - Map o = (Map) res; + Map o = (Map) res; assertEquals(1, o.size()); assertEquals(new Text("v"), o.get(new 
Text("a"))); } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java index f1c8477..d0691a6 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java @@ -16,11 +16,13 @@ * specific language governing permissions and limitations * under the License. */ + package org.apache.hadoop.hive.serde2; import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.Arrays; +import java.util.Base64; import java.util.Collections; import java.util.List; import java.util.Map; @@ -30,7 +32,9 @@ import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.json.HiveJsonStructReader; +import org.apache.hadoop.hive.serde2.json.BinaryEncoding; +import org.apache.hadoop.hive.serde2.json.HiveJsonReader; +import org.apache.hadoop.hive.serde2.json.HiveJsonReader.Feature; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -63,87 +67,146 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, - serdeConstants.LIST_COLUMN_TYPES, - serdeConstants.TIMESTAMP_FORMATS }) - +/** + * Hive SerDe for processing JSON formatted data. This is typically paired with + * the TextInputFormat and therefore each line provided to this SerDe must be a + * single, and complete JSON object.
+ * <h2>Example</h2>
+ * <p>
+ * {"name":"john","age":"30"}<br/>
+ * {"name":"sue","age":"32"}
+ * </p>
+ */ +@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS, + serdeConstants.LIST_COLUMN_TYPES, serdeConstants.TIMESTAMP_FORMATS, + JsonSerDe.BINARY_FORMAT, JsonSerDe.IGNORE_EXTRA }) public class JsonSerDe extends AbstractSerDe { private static final Logger LOG = LoggerFactory.getLogger(JsonSerDe.class); + + public static final String BINARY_FORMAT = "json.binary.format"; + public static final String IGNORE_EXTRA = "text.ignore.extra.fields"; + public static final String NULL_EMPTY_LINES = "text.null.empty.line"; + private List columnNames; - private HiveJsonStructReader structReader; + private BinaryEncoding binaryEncoding; + private boolean nullEmptyLines; + + private HiveJsonReader jsonReader; private StructTypeInfo rowTypeInfo; + private StructObjectInspector soi; @Override public void initialize(Configuration conf, Properties tbl) - throws SerDeException { - List columnTypes; + throws SerDeException { + initialize(conf, tbl, true); + } + + public void initialize(final Configuration conf, final Properties tbl, + final boolean writeablePrimitives) { + LOG.debug("Initializing JsonSerDe: {}", tbl.entrySet()); // Get column names - String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS); - final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? tbl - .getProperty(serdeConstants.COLUMN_NAME_DELIMITER) - : String.valueOf(SerDeUtils.COMMA); - // all table column names - if (columnNameProperty.isEmpty()) { - columnNames = Collections.emptyList(); - } else { - columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter)); - } + final String columnNameProperty = + tbl.getProperty(serdeConstants.LIST_COLUMNS); + final String columnNameDelimiter = tbl.getProperty( + serdeConstants.COLUMN_NAME_DELIMITER, String.valueOf(SerDeUtils.COMMA)); + + this.columnNames = columnNameProperty.isEmpty() ? Collections.emptyList() + : Arrays.asList(columnNameProperty.split(columnNameDelimiter)); // all column types - String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES); - if (columnTypeProperty.isEmpty()) { - columnTypes = Collections.emptyList(); - } else { - columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); - } + final String columnTypeProperty = + tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES); + + final List columnTypes = + columnTypeProperty.isEmpty() ? 
Collections.emptyList() + : TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); LOG.debug("columns: {}, {}", columnNameProperty, columnNames); LOG.debug("types: {}, {} ", columnTypeProperty, columnTypes); assert (columnNames.size() == columnTypes.size()); - rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); + final String nullEmpty = tbl.getProperty(NULL_EMPTY_LINES, "false"); + this.nullEmptyLines = Boolean.parseBoolean(nullEmpty); + + this.rowTypeInfo = (StructTypeInfo) TypeInfoFactory + .getStructTypeInfo(columnNames, columnTypes); - TimestampParser tsParser = new TimestampParser( - HiveStringUtils.splitAndUnEscape(tbl.getProperty(serdeConstants.TIMESTAMP_FORMATS))); - structReader = new HiveJsonStructReader(rowTypeInfo, tsParser); - structReader.setIgnoreUnknownFields(true); - structReader.enableHiveColIndexParsing(true); - structReader.setWritablesUsage(true); + this.soi = (StructObjectInspector) TypeInfoUtils + .getStandardWritableObjectInspectorFromTypeInfo(this.rowTypeInfo); + + final TimestampParser tsParser; + final String parserFormats = + tbl.getProperty(serdeConstants.TIMESTAMP_FORMATS); + if (parserFormats != null) { + tsParser = + new TimestampParser(HiveStringUtils.splitAndUnEscape(parserFormats)); + } else { + tsParser = new TimestampParser(); + } + + final String binaryEncodingStr = tbl.getProperty(BINARY_FORMAT, "base64"); + this.binaryEncoding = + BinaryEncoding.valueOf(binaryEncodingStr.toUpperCase()); + + this.jsonReader = new HiveJsonReader(this.soi, tsParser); + this.jsonReader.enable(Feature.COL_INDEX_PARSING); + if (writeablePrimitives) { + this.jsonReader.enable(Feature.PRIMITIVE_TO_WRITABLE); + } + + this.jsonReader.setBinaryEncoding(binaryEncoding); + + final String ignoreExtras = tbl.getProperty(IGNORE_EXTRA, "true"); + if (Boolean.parseBoolean(ignoreExtras)) { + this.jsonReader.enable(Feature.IGNORE_UKNOWN_FIELDS); + } + + LOG.debug("JSON Struct Reader: {}", jsonReader); } /** - * Takes JSON string in Text form, and has to return an object representation above - * it that's readable by the corresponding object inspector. + * Deserialize an object out of a Writable blob containing JSON text. The + * return value of this function will be constant since the function will + * reuse the returned object. If the client wants to keep a copy of the + * object, the client needs to clone the returned value by calling + * ObjectInspectorUtils.getStandardObject(). 
* - * For this implementation, since we're using the jackson parser, we can construct - * our own object implementation, and we use HCatRecord for it + * @param blob The Writable (Text) object containing a serialized object + * @return A List containing all the values of the row */ @Override public Object deserialize(Writable blob) throws SerDeException { + final Text t = (Text) blob; + + if (t.getLength() == 0) { + if (!this.nullEmptyLines) { + throw new SerDeException("Encountered an empty row in the text file"); + } + final int fieldCount = soi.getAllStructFieldRefs().size(); + return Collections.nCopies(fieldCount, null); + } - Object row; - Text t = (Text) blob; try { - row = structReader.parseStruct(new ByteArrayInputStream((t.getBytes()), 0, t.getLength())); - return row; + return jsonReader.parseStruct( + new ByteArrayInputStream((t.getBytes()), 0, t.getLength())); } catch (Exception e) { - LOG.warn("Error [{}] parsing json text [{}].", e, t); + LOG.warn("Error parsing JSON text [{}].", t, e); throw new SerDeException(e); } } /** - * Given an object and object inspector pair, traverse the object - * and generate a Text representation of the object. + * Given an object and object inspector pair, traverse the object and generate + * a Text representation of the object. */ @Override public Writable serialize(Object obj, ObjectInspector objInspector) - throws SerDeException { + throws SerDeException { StringBuilder sb = new StringBuilder(); try { @@ -182,7 +245,8 @@ private static StringBuilder appendWithQuotes(StringBuilder sb, String value) { // for details - trying to enable Jackson to ignore that doesn't seem to work(compilation failure // when attempting to use that feature, so having to change the production itself. // Also, throws IOException when Binary is detected. - private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector oi) throws IOException { + private void buildJSONString(StringBuilder sb, Object o, + ObjectInspector oi) throws IOException { switch (oi.getCategory()) { case PRIMITIVE: { @@ -193,7 +257,7 @@ private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector switch (poi.getPrimitiveCategory()) { case BOOLEAN: { boolean b = ((BooleanObjectInspector) poi).get(o); - sb.append(b ? 
"true" : "false"); + sb.append(b); break; } case BYTE: { @@ -228,9 +292,8 @@ private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector } case BINARY: byte[] b = ((BinaryObjectInspector) oi).getPrimitiveJavaObject(o); - Text txt = new Text(); - txt.set(b, 0, b.length); - appendWithQuotes(sb, SerDeUtils.escapeString(txt.toString())); + Text binaryText = encodeBinary(b); + appendWithQuotes(sb, SerDeUtils.escapeString(binaryText.toString())); break; case DATE: Date d = ((DateObjectInspector) poi).getPrimitiveJavaObject(o); @@ -251,10 +314,8 @@ private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector break; } case CHAR: { - //this should use HiveChar.getPaddedValue() but it's protected; currently (v0.13) - // HiveChar.toString() returns getPaddedValue() - String s = SerDeUtils.escapeString( - ((HiveCharObjectInspector) poi).getPrimitiveJavaObject(o).toString()); + String s = SerDeUtils.escapeString(((HiveCharObjectInspector) poi) + .getPrimitiveJavaObject(o).getPaddedValue()); appendWithQuotes(sb, s); break; } @@ -355,6 +416,18 @@ private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector } } + private Text encodeBinary(byte[] b) { + switch (this.binaryEncoding) { + case BASE64: + return new Text(Base64.getMimeEncoder().encodeToString(b)); + + case DEFAULT: + default: + Text txt = new Text(); + txt.set(b, 0, b.length); + return txt; + } + } /** * Returns an object inspector for the specified schema that @@ -362,7 +435,7 @@ private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector */ @Override public ObjectInspector getObjectInspector() throws SerDeException { - return structReader.getObjectInspector(); + return jsonReader.getObjectInspector(); } @Override @@ -380,8 +453,11 @@ public StructTypeInfo getTypeInfo() { return rowTypeInfo; } - public void setWriteablesUsage(boolean b) { - structReader.setWritablesUsage(b); + public BinaryEncoding getBinaryEncoding() { + return binaryEncoding; } + public boolean isNullEmptyLines() { + return nullEmptyLines; + } } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/json/BinaryEncoding.java b/serde/src/java/org/apache/hadoop/hive/serde2/json/BinaryEncoding.java new file mode 100644 index 0000000..58cdb9f --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/json/BinaryEncoding.java @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.json; + +/** + * Enums describing the available String->Bytes encoding available for JSON + * parsing. This base-64 variant is what most people would think of "the + * standard" Base64 encoding for JSON: the specific MIME content transfer + * encoding. 
+ */ +public enum BinaryEncoding { + DEFAULT, BASE64 +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java b/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java new file mode 100644 index 0000000..f3e85a7 --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java @@ -0,0 +1,519 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.json; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.io.Text; +import org.apache.hive.common.util.TimestampParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.JsonNodeType; +import com.fasterxml.jackson.databind.node.TextNode; +import com.google.common.base.Preconditions; + +/** + * This class converts JSON strings into Java or Hive Primitive objects. + * + * Support types are:
+ * <table>
+ * <tr><td>JSON Type</td><td>Java Type</td><td>Notes</td></tr>
+ * <tr><td>Object</td><td>java.util.List</td><td>Each element may be a different type</td></tr>
+ * <tr><td>Array</td><td>java.util.List</td><td>Each element is the same type</td></tr>
+ * <tr><td>Map</td><td>java.util.Map</td><td>Keys must be the same primitive type; every value is the same type</td></tr>
+ * </table>
+ */ +public class HiveJsonReader { + + private static final Logger LOG = + LoggerFactory.getLogger(HiveJsonReader.class); + + private final Map<Pair<StructObjectInspector, String>, StructField> discoveredFields = + new HashMap<>(); + + private final Set<Pair<StructObjectInspector, String>> discoveredUnknownFields = + new HashSet<>(); + + private final EnumSet<Feature> features = EnumSet.noneOf(Feature.class); + + private final ObjectMapper objectMapper = new ObjectMapper(); + + private final TimestampParser tsParser; + private BinaryEncoding binaryEncoding; + private final ObjectInspector oi; + + /** + * Enumeration that defines all on/off features for this reader. + */ + public enum Feature { + COL_INDEX_PARSING, PRIMITIVE_TO_WRITABLE, IGNORE_UKNOWN_FIELDS + } + + /** + * Constructor using the default Hive timestamp parser. + * + * @param oi ObjectInspector for all the fields in the JSON object + */ + public HiveJsonReader(ObjectInspector oi) { + this(oi, new TimestampParser()); + } + + /** + * Constructor accepting a custom timestamp parser. + * + * @param oi ObjectInspector for all the fields in the JSON object + * @param tsParser Custom timestamp parser + */ + public HiveJsonReader(ObjectInspector oi, TimestampParser tsParser) { + this.binaryEncoding = BinaryEncoding.BASE64; + this.tsParser = tsParser; + this.oi = oi; + } + + /** + * Parse text containing a complete JSON object. + * + * @param text The text to parse + * @return A List of Objects, one for each field in the JSON object + * @throws IOException Unable to parse the JSON text + * @throws SerDeException The SerDe is not configured correctly + */ + public Object parseStruct(final String text) + throws IOException, SerDeException { + Preconditions.checkNotNull(text); + Preconditions.checkState(this.oi != null); + final JsonNode rootNode = this.objectMapper.readTree(text); + return visitNode(rootNode, this.oi); + } + + /** + * Parse text containing a complete JSON object. + * + * @param in The InputStream to read the text from + * @return A List of Objects, one for each field in the JSON object + * @throws IOException Unable to parse the JSON text + * @throws SerDeException The SerDe is not configured correctly + */ + public Object parseStruct(final InputStream in) + throws IOException, SerDeException { + Preconditions.checkNotNull(in); + Preconditions.checkState(this.oi != null); + final JsonNode rootNode = this.objectMapper.readTree(in); + return visitNode(rootNode, this.oi); + } + + /** + * Visit a node and parse it based on the provided ObjectInspector. + * + * @param rootNode The root node to process + * @param oi The ObjectInspector to use + * @return The value in this node (may be a complex type if nested) + * @throws SerDeException The SerDe is not configured correctly + */ + private Object visitNode(final JsonNode rootNode, final ObjectInspector oi) + throws SerDeException { + + if (!rootNode.isNull()) { + switch (oi.getCategory()) { + case PRIMITIVE: + final Object value = visitLeafNode(rootNode, oi); + return optionallyWrapWritable(value, oi); + case LIST: + return visitArrayNode(rootNode, oi); + case STRUCT: + return visitStructNode(rootNode, oi); + case MAP: + return visitMapNode(rootNode, oi); + default: + throw new SerDeException( + "Parsing of: " + oi.getCategory() + " is not supported"); + } + } + + return null; + } + + /** + * The typical usage of this reader requires that it return Hadoop Writable + * objects. However, some uses of this reader want the return values to be Java + * primitive objects.
This reader works explicitly in Java primitive objects + * and will wrap the objects in Writable containers if required. + * + * @param value The Java primitive object to wrap + * @param oi The ObjectInspector providing the type to wrap into + * @return A Hadoop Writable if required; otherwise the object itself + */ + private Object optionallyWrapWritable(final Object value, + final ObjectInspector oi) { + if (!isEnabled(Feature.PRIMITIVE_TO_WRITABLE)) { + return value; + } + + final PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + final PrimitiveTypeInfo typeInfo = poi.getTypeInfo(); + + return PrimitiveObjectInspectorFactory + .getPrimitiveJavaObjectInspector(typeInfo) + .getPrimitiveWritableObject(value); + } + + /** + * Visit a node if it is expected to be a Map (a.k.a. JSON Object) + * + * @param rootNode The node pointing at the JSON object + * @param oi The ObjectInspector to parse the Map (must be a + * MapObjectInspector) + * @return A Java Map containing the contents of the JSON map + * @throws SerDeException The SerDe is not configured correctly + */ + private Map<Object, Object> visitMapNode(final JsonNode rootNode, + final ObjectInspector oi) throws SerDeException { + Preconditions.checkArgument(JsonNodeType.OBJECT == rootNode.getNodeType()); + + final Map<Object, Object> ret = new LinkedHashMap<>(); + + final ObjectInspector mapKeyInspector = + ((MapObjectInspector) oi).getMapKeyObjectInspector(); + + final ObjectInspector mapValueInspector = + ((MapObjectInspector) oi).getMapValueObjectInspector(); + + if (!(mapKeyInspector instanceof PrimitiveObjectInspector)) { + throw new SerDeException("Map key must be a primitive type"); + } + + final Iterator<Entry<String, JsonNode>> it = rootNode.fields(); + while (it.hasNext()) { + final Entry<String, JsonNode> field = it.next(); + final Object key = + visitNode(new TextNode(field.getKey()), mapKeyInspector); + final Object val = visitNode(field.getValue(), mapValueInspector); + ret.put(key, val); + } + + return ret; + } + + /** + * Visit a node if it is expected to be a Struct data type (a.k.a. JSON + * Object) + * + * @param rootNode The node pointing at the JSON object + * @param oi The ObjectInspector to parse the struct (must be a + * StructObjectInspector) + * @return A List of Objects, each element is an element of the + * struct + * @throws SerDeException The SerDe is not configured correctly + */ + private List<Object> visitStructNode(final JsonNode rootNode, + final ObjectInspector oi) throws SerDeException { + + Preconditions.checkArgument(JsonNodeType.OBJECT == rootNode.getNodeType()); + + final StructObjectInspector structInspector = (StructObjectInspector) oi; + + final int fieldCount = structInspector.getAllStructFieldRefs().size(); + final List<Object> ret = Arrays.asList(new Object[fieldCount]); + + final Iterator<Entry<String, JsonNode>> it = rootNode.fields(); + while (it.hasNext()) { + final Entry<String, JsonNode> field = it.next(); + final String fieldName = field.getKey(); + final JsonNode childNode = field.getValue(); + final StructField structField = + getStructField(structInspector, fieldName); + + // If the struct field is null it is because there is a field defined in + // the JSON object that was not defined in the table definition. Ignore. + if (structField != null) { + final Object childValue = + visitNode(childNode, structField.getFieldObjectInspector()); + ret.set(structField.getFieldID(), childValue); + } + } + + return ret; + } + + /** + * Visit a node if it is expected to be a JSON Array data type (a.k.a.
Hive + * Array type) + * + * @param rootNode The node pointing at the JSON object + * @param oi The ObjectInspector to parse the List (must be a + * ListObjectInspector) + * @return A Java List of Objects, each element is an element of the array + * @throws SerDeException The SerDe is not configured correctly + */ + private List<Object> visitArrayNode(final JsonNode rootNode, + final ObjectInspector oi) throws SerDeException { + Preconditions.checkArgument(JsonNodeType.ARRAY == rootNode.getNodeType()); + + final ObjectInspector loi = + ((ListObjectInspector) oi).getListElementObjectInspector(); + + final List<Object> ret = new ArrayList<>(rootNode.size()); + final Iterator<JsonNode> it = rootNode.elements(); + + while (it.hasNext()) { + final JsonNode element = it.next(); + ret.add(visitNode(element, loi)); + } + + return ret; + } + + /** + * Visit a node if it is expected to be a primitive value (JSON leaf node). + * + * @param leafNode The node pointing at the JSON object + * @param oi The ObjectInspector to parse the value (must be a + * PrimitiveObjectInspector) + * @return A Java primitive Object + * @throws SerDeException The SerDe is not configured correctly + */ + private Object visitLeafNode(final JsonNode leafNode, + final ObjectInspector oi) throws SerDeException { + Preconditions.checkArgument(leafNode.getNodeType() != JsonNodeType.OBJECT); + Preconditions.checkArgument(leafNode.getNodeType() != JsonNodeType.ARRAY); + + final PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + final PrimitiveTypeInfo typeInfo = poi.getTypeInfo(); + + switch (typeInfo.getPrimitiveCategory()) { + case INT: + return Integer.valueOf(leafNode.asInt()); + case BYTE: + return Byte.valueOf((byte) leafNode.asInt()); + case SHORT: + return Short.valueOf((short) leafNode.asInt()); + case LONG: + return Long.valueOf(leafNode.asLong()); + case BOOLEAN: + return Boolean.valueOf(leafNode.asBoolean()); + case FLOAT: + return Float.valueOf((float) leafNode.asDouble()); + case DOUBLE: + return Double.valueOf(leafNode.asDouble()); + case STRING: + return leafNode.asText(); + case BINARY: + return getByteValue(leafNode); + case DATE: + return Date.valueOf(leafNode.asText()); + case TIMESTAMP: + return tsParser.parseTimestamp(leafNode.asText()); + case DECIMAL: + return HiveDecimal.create(leafNode.asText()); + case VARCHAR: + return new HiveVarchar(leafNode.asText(), + ((BaseCharTypeInfo) typeInfo).getLength()); + case CHAR: + return new HiveChar(leafNode.asText(), + ((BaseCharTypeInfo) typeInfo).getLength()); + default: + throw new SerDeException("Could not convert from string to type " + + typeInfo.getTypeName()); + } + } + + /** + * A user may configure the encoding for binary data represented as text + * within a JSON object. This method applies that encoding to the text.
+ * + * @param binaryNode JSON Node containing the binary data + * @return A byte array with the binary data + * @throws SerDeException The SerDe is not configured correctly + */ + private byte[] getByteValue(final JsonNode binaryNode) throws SerDeException { + try { + switch (this.binaryEncoding) { + case DEFAULT: + final String byteText = binaryNode.textValue(); + final byte[] buff = byteText.getBytes(StandardCharsets.UTF_8); + return Text.decode(buff, 0, buff.length) + .getBytes(StandardCharsets.UTF_8); + case BASE64: + return binaryNode.binaryValue(); + default: + break; + } + } catch (Exception e) { + throw new SerDeException("Error generating JSON binary type from record.", + e); + } + throw new SerDeException("Error generating JSON binary type from record."); + } + + /** + * Matches the JSON object's field name with the Hive data type. + * + * @param oi The ObjectInspector to look the field up in + * @param fieldName The name of the field parsed from the JSON text + * @return The metadata regarding this field + * @throws SerDeException The SerDe is not configured correctly + */ + private StructField getStructField(final StructObjectInspector oi, + final String fieldName) throws SerDeException { + + final Pair<StructObjectInspector, String> pair = + ImmutablePair.of(oi, fieldName); + + // Ignore the field if it has been ignored before + if (this.discoveredUnknownFields.contains(pair)) { + return null; + } + + // Return from cache if the field has already been discovered + StructField structField = this.discoveredFields.get(pair); + if (structField != null) { + return structField; + } + + // Otherwise attempt to discover the field + if (isEnabled(Feature.COL_INDEX_PARSING)) { + int colIndex = getColIndex(fieldName); + if (colIndex >= 0) { + structField = oi.getAllStructFieldRefs().get(colIndex); + } + } + if (structField == null) { + try { + structField = oi.getStructFieldRef(fieldName); + } catch (Exception e) { + // No such field + } + } + if (structField != null) { + // cache it for next time + this.discoveredFields.put(pair, structField); + } else { + // Tried everything and did not discover this field + if (isEnabled(Feature.IGNORE_UKNOWN_FIELDS)) { + if (this.discoveredUnknownFields.add(pair)) { + LOG.warn("Discovered unknown field: {}. Ignoring.", fieldName); + } + } else { + throw new SerDeException( + "Field found in JSON does not match table definition: " + + fieldName); + } + } + + return structField; + } + + private final Pattern internalPattern = Pattern.compile("^_col([0-9]+)$"); + + /** + * Look up a column based on its index. + * + * @param internalName The name of the column + * @return The index of the field or -1 if the field name does not encode its + * index number + */ + private int getColIndex(final String internalName) { + // Hive may rename a column to the internal name "_col<index>"; recover the + // index from the name with a regex (a previous implementation recognized + // only single-digit column indexes). + final Matcher m = internalPattern.matcher(internalName); + return m.matches() ?
Integer.valueOf(m.group(1)).intValue() : -1; + } + + public void enable(Feature feature) { + this.features.add(feature); + } + + public void disable(Feature feature) { + this.features.remove(feature); + } + + public Set<Feature> getFeatures() { + return Collections.unmodifiableSet(this.features); + } + + public boolean isEnabled(Feature feature) { + return this.features.contains(feature); + } + + public ObjectInspector getObjectInspector() { + return oi; + } + + public BinaryEncoding getBinaryEncodingType() { + return binaryEncoding; + } + + public void setBinaryEncoding(BinaryEncoding encoding) { + this.binaryEncoding = encoding; + } + +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonStructReader.java b/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonStructReader.java deleted file mode 100644 index ec4efad..0000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonStructReader.java +++ /dev/null @@ -1,402 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -package org.apache.hadoop.hive.serde2.json; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.CharacterCodingException; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.hadoop.hive.common.type.Date; -import org.apache.hadoop.hive.common.type.HiveChar; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.common.type.HiveVarchar; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; -import org.apache.hadoop.io.Text; -import org.apache.hive.common.util.TimestampParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonParseException; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonToken; - -public class HiveJsonStructReader { - - private static final Logger LOG = LoggerFactory.getLogger(HiveJsonStructReader.class); - - private ObjectInspector oi; - private JsonFactory factory; - - - Set reportedUnknownFieldNames = new HashSet<>(); - - private static boolean ignoreUnknownFields; - private static boolean hiveColIndexParsing; - private boolean writeablePrimitives; - - private TimestampParser tsParser; - - public HiveJsonStructReader(TypeInfo t) { - this(t, new TimestampParser()); - } - - public HiveJsonStructReader(TypeInfo t, TimestampParser tsParser) { - this.tsParser = tsParser; - oi = TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(t); - factory = new JsonFactory(); - } - - public Object parseStruct(String text) throws JsonParseException, IOException, SerDeException { - JsonParser parser = factory.createParser(text); - return parseInternal(parser); - } - - public Object parseStruct(InputStream is) throws JsonParseException, IOException, SerDeException { - JsonParser parser = factory.createParser(is); - return parseInternal(parser); - } - - private Object parseInternal(JsonParser parser) throws SerDeException { - try { - parser.nextToken(); - Object res = parseDispatcher(parser, oi); - return res; - } catch (Exception e) { - String locationStr = parser.getCurrentLocation().getLineNr() + "," + parser.getCurrentLocation().getColumnNr(); - throw new SerDeException("at[" + locationStr + "]: " + e.getMessage(), e); - } - } - - private Object parseDispatcher(JsonParser parser, ObjectInspector oi) - throws JsonParseException, IOException, 
SerDeException { - - switch (oi.getCategory()) { - case PRIMITIVE: - return parsePrimitive(parser, (PrimitiveObjectInspector) oi); - case LIST: - return parseList(parser, (ListObjectInspector) oi); - case STRUCT: - return parseStruct(parser, (StructObjectInspector) oi); - case MAP: - return parseMap(parser, (MapObjectInspector) oi); - default: - throw new SerDeException("parsing of: " + oi.getCategory() + " is not handled"); - } - } - - private Object parseMap(JsonParser parser, MapObjectInspector oi) throws IOException, SerDeException { - - if (parser.getCurrentToken() == JsonToken.VALUE_NULL) { - parser.nextToken(); - return null; - } - - Map ret = new LinkedHashMap<>(); - - if (parser.getCurrentToken() != JsonToken.START_OBJECT) { - throw new SerDeException("struct expected"); - } - - if (!(oi.getMapKeyObjectInspector() instanceof PrimitiveObjectInspector)) { - throw new SerDeException("map key must be a primitive"); - } - PrimitiveObjectInspector keyOI = (PrimitiveObjectInspector) oi.getMapKeyObjectInspector(); - ObjectInspector valOI = oi.getMapValueObjectInspector(); - - JsonToken currentToken = parser.nextToken(); - while (currentToken != null && currentToken != JsonToken.END_OBJECT) { - - if (currentToken != JsonToken.FIELD_NAME) { - throw new SerDeException("unexpected token: " + currentToken); - } - - Object key = parseMapKey(parser, keyOI); - Object val = parseDispatcher(parser, valOI); - ret.put(key, val); - - currentToken = parser.getCurrentToken(); - } - if (currentToken != null) { - parser.nextToken(); - } - return ret; - - } - - private Object parseStruct(JsonParser parser, StructObjectInspector oi) - throws JsonParseException, IOException, SerDeException { - - Object[] ret = new Object[oi.getAllStructFieldRefs().size()]; - - if (parser.getCurrentToken() == JsonToken.VALUE_NULL) { - parser.nextToken(); - return null; - } - if (parser.getCurrentToken() != JsonToken.START_OBJECT) { - throw new SerDeException("struct expected"); - } - JsonToken currentToken = parser.nextToken(); - while (currentToken != null && currentToken != JsonToken.END_OBJECT) { - - switch (currentToken) { - case FIELD_NAME: - String name = parser.getCurrentName(); - try { - StructField field = null; - try { - field = getStructField(oi, name); - } catch (RuntimeException e) { - if (ignoreUnknownFields) { - if (!reportedUnknownFieldNames.contains(name)) { - LOG.warn("ignoring field:" + name); - reportedUnknownFieldNames.add(name); - } - parser.nextToken(); - skipValue(parser); - break; - } - } - if (field == null) { - throw new SerDeException("undeclared field"); - } - parser.nextToken(); - ret[field.getFieldID()] = parseDispatcher(parser, field.getFieldObjectInspector()); - } catch (Exception e) { - throw new SerDeException("struct field " + name + ": " + e.getMessage(), e); - } - break; - default: - throw new SerDeException("unexpected token: " + currentToken); - } - currentToken = parser.getCurrentToken(); - } - if (currentToken != null) { - parser.nextToken(); - } - return ret; - } - - private StructField getStructField(StructObjectInspector oi, String name) { - if (hiveColIndexParsing) { - int colIndex = getColIndex(name); - if (colIndex >= 0) { - return oi.getAllStructFieldRefs().get(colIndex); - } - } - // FIXME: linear scan inside the below method...get a map here or something.. 
- return oi.getStructFieldRef(name); - } - - Pattern internalPattern = Pattern.compile("^_col([0-9]+)$"); - - private int getColIndex(String internalName) { - // The above line should have been all the implementation that - // we need, but due to a bug in that impl which recognizes - // only single-digit columns, we need another impl here. - Matcher m = internalPattern.matcher(internalName); - if (!m.matches()) { - return -1; - } else { - return Integer.parseInt(m.group(1)); - } - } - - private static void skipValue(JsonParser parser) throws JsonParseException, IOException { - - int array = 0; - int object = 0; - do { - JsonToken currentToken = parser.getCurrentToken(); - if(currentToken == JsonToken.START_ARRAY) { - array++; - } - if (currentToken == JsonToken.END_ARRAY) { - array--; - } - if (currentToken == JsonToken.START_OBJECT) { - object++; - } - if (currentToken == JsonToken.END_OBJECT) { - object--; - } - - parser.nextToken(); - - } while (array > 0 || object > 0); - - } - - private Object parseList(JsonParser parser, ListObjectInspector oi) - throws JsonParseException, IOException, SerDeException { - List ret = new ArrayList<>(); - - if (parser.getCurrentToken() == JsonToken.VALUE_NULL) { - parser.nextToken(); - return null; - } - - if (parser.getCurrentToken() != JsonToken.START_ARRAY) { - throw new SerDeException("array expected"); - } - ObjectInspector eOI = oi.getListElementObjectInspector(); - JsonToken currentToken = parser.nextToken(); - try { - while (currentToken != null && currentToken != JsonToken.END_ARRAY) { - ret.add(parseDispatcher(parser, eOI)); - currentToken = parser.getCurrentToken(); - } - } catch (Exception e) { - throw new SerDeException("array: " + e.getMessage(), e); - } - - currentToken = parser.nextToken(); - - return ret; - } - - private Object parsePrimitive(JsonParser parser, PrimitiveObjectInspector oi) - throws SerDeException, IOException { - JsonToken currentToken = parser.getCurrentToken(); - if (currentToken == null) { - return null; - } - try { - switch (parser.getCurrentToken()) { - case VALUE_FALSE: - case VALUE_TRUE: - case VALUE_NUMBER_INT: - case VALUE_NUMBER_FLOAT: - case VALUE_STRING: - return getObjectOfCorrespondingPrimitiveType(parser.getValueAsString(), oi); - case VALUE_NULL: - return null; - default: - throw new SerDeException("unexpected token type: " + currentToken); - } - } finally { - parser.nextToken(); - - } - } - - private Object getObjectOfCorrespondingPrimitiveType(String s, PrimitiveObjectInspector oi) - throws IOException { - PrimitiveTypeInfo typeInfo = oi.getTypeInfo(); - if (writeablePrimitives) { - Converter c = ObjectInspectorConverters.getConverter(PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi); - return c.convert(s); - } - - switch (typeInfo.getPrimitiveCategory()) { - case INT: - return Integer.valueOf(s); - case BYTE: - return Byte.valueOf(s); - case SHORT: - return Short.valueOf(s); - case LONG: - return Long.valueOf(s); - case BOOLEAN: - return (s.equalsIgnoreCase("true")); - case FLOAT: - return Float.valueOf(s); - case DOUBLE: - return Double.valueOf(s); - case STRING: - return s; - case BINARY: - try { - String t = Text.decode(s.getBytes(), 0, s.getBytes().length); - return t.getBytes(); - } catch (CharacterCodingException e) { - LOG.warn("Error generating json binary type from object.", e); - return null; - } - case DATE: - return Date.valueOf(s); - case TIMESTAMP: - return tsParser.parseTimestamp(s); - case DECIMAL: - return HiveDecimal.create(s); - case VARCHAR: - return new HiveVarchar(s, 
((BaseCharTypeInfo) typeInfo).getLength()); - case CHAR: - return new HiveChar(s, ((BaseCharTypeInfo) typeInfo).getLength()); - } - throw new IOException("Could not convert from string to map type " + typeInfo.getTypeName()); - } - - private Object parseMapKey(JsonParser parser, PrimitiveObjectInspector oi) throws SerDeException, IOException { - JsonToken currentToken = parser.getCurrentToken(); - if (currentToken == null) { - return null; - } - try { - switch (parser.getCurrentToken()) { - case FIELD_NAME: - return getObjectOfCorrespondingPrimitiveType(parser.getValueAsString(), oi); - case VALUE_NULL: - return null; - default: - throw new SerDeException("unexpected token type: " + currentToken); - } - } finally { - parser.nextToken(); - - } - } - - public void setIgnoreUnknownFields(boolean b) { - ignoreUnknownFields = b; - } - - public void enableHiveColIndexParsing(boolean b) { - hiveColIndexParsing = b; - } - - public void setWritablesUsage(boolean b) { - writeablePrimitives = b; - } - - public ObjectInspector getObjectInspector() { - return oi; - } -} diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java b/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java new file mode 100644 index 0000000..83ae49e --- /dev/null +++ b/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java @@ -0,0 +1,211 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Properties; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + +/** + * Test suite for the JSON SerDe class. 
+ */ +public class TestJsonSerDe { + + @Test + public void testPrimitiveDataTypes() throws Exception { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, + "string,float,int,boolean,timestamp"); + props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis"); + + JsonSerDe serde = new JsonSerDe(); + serde.initialize(null, props, false); + + final String jsonText = loadJson("simple.json"); + + final Text text = new Text(jsonText); + final List results = (List) serde.deserialize(text); + + Assert.assertNotNull(results); + Assert.assertEquals(5, results.size()); + Assert.assertEquals("giraffe", results.get(0)); + Assert.assertEquals(5.5f, results.get(1)); + Assert.assertEquals(1360, results.get(2)); + Assert.assertEquals(true, results.get(3)); + Assert.assertEquals(Timestamp.ofEpochMilli(1549751270013L), results.get(4)); + } + + @Test + public void testArray() throws Exception { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "list,items"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string,array<string>"); + props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis"); + + JsonSerDe serde = new JsonSerDe(); + serde.initialize(null, props, false); + + final String jsonText = loadJson("array.json"); + + final Text text = new Text(jsonText); + final List results = (List) serde.deserialize(text); + + Assert.assertNotNull(results); + Assert.assertEquals(2, results.size()); + Assert.assertEquals("grocery", results.get(0)); + Assert.assertEquals(Arrays.asList("milk", "eggs", "bread"), results.get(1)); + } + + /** + * Test when a map has a key defined as a numeric value. Technically, JSON + * does not support this because each key in a map must be a quoted string. + * Unquoted keys (hence an int value) are allowed by JavaScript, but not by + * the JSON specification. For Hive, the int map key type is stored as a string + * and must be converted back into an int type.
+ */ + @Test + public void testMapNumericKey() throws Exception { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "map"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "map<int,string>"); + props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis"); + + JsonSerDe serde = new JsonSerDe(); + serde.initialize(null, props, false); + + final String jsonText = loadJson("map_int_key.json"); + + final Text text = new Text(jsonText); + final List results = (List) serde.deserialize(text); + + Assert.assertNotNull(results); + Assert.assertEquals(1, results.size()); + + Map resultMap = (Map) results.get(0); + Object value1 = resultMap.get(1); + Object value2 = resultMap.get(2); + + Assert.assertEquals(2, resultMap.size()); + Assert.assertEquals("2001-01-01", value1); + Assert.assertEquals(null, value2); + } + + @Test + public void testBlankLineAllowed() throws Exception { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "a,b,c"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,int,int"); + props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis"); + + // This is the test parameter + props.setProperty(JsonSerDe.NULL_EMPTY_LINES, "true"); + + JsonSerDe serde = new JsonSerDe(); + serde.initialize(null, props, false); + + final Text text = new Text(""); + final List results = (List) serde.deserialize(text); + Assert.assertEquals(Arrays.asList(null, null, null), results); + } + + @Test(expected = SerDeException.class) + public void testBlankLineException() throws Exception { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "a,b,c"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,int,int"); + props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis"); + + // This is the test parameter + props.setProperty(JsonSerDe.NULL_EMPTY_LINES, "false"); + + JsonSerDe serde = new JsonSerDe(); + serde.initialize(null, props, false); + + serde.deserialize(new Text("")); + } + + @Test + public void testChar() throws Exception { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "a"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "char(5)"); + props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis"); + + JsonSerDe serde = new JsonSerDe(); + serde.initialize(null, props, false); + + List results = (List) serde.deserialize(new Text("{\"a\":\"xxx\"}")); + Assert.assertNotNull(results); + Assert.assertEquals(1, results.size()); + Assert.assertEquals("xxx  ", results.get(0).toString()); + } + + /** + * When parsing the JSON object, a cache is kept for the definition of each + * field and its index in its most immediate struct. Check that if two fields + * have the same name, at the same index in their respective structs, they + * are not confused with one another.
+ */ + @Test + public void testCacheIndexSameFieldName() throws Exception { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "a,b"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,struct<a:boolean>"); + props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis"); + + JsonSerDe serde = new JsonSerDe(); + serde.initialize(null, props, false); + + List results = (List) serde.deserialize(new Text("{\"a\":5,\"b\":{\"a\":true}}")); + Assert.assertNotNull(results); + Assert.assertEquals(2, results.size()); + Assert.assertEquals(5, results.get(0)); + Assert.assertEquals(Arrays.asList(true), results.get(1)); + } + + /** + * Accepts a file name and loads it from /src/test/resources/json + * + * @param resourceName The name of the file to load + * @return A JSON string, all whitespace removed + * @throws IOException Failed to load the file + */ + private String loadJson(final String resourceName) throws IOException { + final String path = "/json/" + resourceName; + final URL url = this.getClass().getResource(path); + final File testJson = new File(url.getFile()); + final String jsonText = + FileUtils.readFileToString(testJson, StandardCharsets.UTF_8); + return jsonText.replaceAll("\\s+", ""); + } +} diff --git a/serde/src/test/resources/json/array.json b/serde/src/test/resources/json/array.json new file mode 100644 index 0000000..f0b2ac7 --- /dev/null +++ b/serde/src/test/resources/json/array.json @@ -0,0 +1,8 @@ +{ + "list": "grocery", + "items": [ + "milk", + "eggs", + "bread" + ] +} \ No newline at end of file diff --git a/serde/src/test/resources/json/map_int_key.json b/serde/src/test/resources/json/map_int_key.json new file mode 100644 index 0000000..786b6e4 --- /dev/null +++ b/serde/src/test/resources/json/map_int_key.json @@ -0,0 +1,6 @@ +{ + "map": { + "1": "2001-01-01", + "2": null + } +} \ No newline at end of file diff --git a/serde/src/test/resources/json/simple.json b/serde/src/test/resources/json/simple.json new file mode 100644 index 0000000..c35d847 --- /dev/null +++ b/serde/src/test/resources/json/simple.json @@ -0,0 +1,7 @@ +{ + "name": "giraffe", + "height": 5.5, + "weight": 1360, + "endangered": true, + "born": 1549751270013 +} \ No newline at end of file
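For context, a minimal sketch of how the refactored reader is meant to be driven after this patch, mirroring the new wiring in GenericUDFJsonRead.initialize(); the class name, type string, and JSON input below are illustrative only and not part of the change:

```java
import org.apache.hadoop.hive.serde2.json.HiveJsonReader;
import org.apache.hadoop.hive.serde2.json.HiveJsonReader.Feature;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class HiveJsonReaderSketch {
  public static void main(String[] args) throws Exception {
    // The reader now takes an ObjectInspector instead of a TypeInfo, so the
    // caller resolves the type string up front.
    final TypeInfo t =
        TypeInfoUtils.getTypeInfoFromTypeString("struct<name:string,age:int>");
    final ObjectInspector oi =
        TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(t);

    // Boolean setters (setWritablesUsage, setIgnoreUnknownFields, ...) are
    // replaced by an explicit Feature enum.
    final HiveJsonReader reader = new HiveJsonReader(oi);
    reader.enable(Feature.PRIMITIVE_TO_WRITABLE);

    // Structs now deserialize to a java.util.List rather than an Object[],
    // which is what the HCatalog fatLand() changes above rely on.
    final Object row = reader.parseStruct("{\"name\":\"sue\",\"age\":32}");
    System.out.println(row); // prints [sue, 32] as Writable values
  }
}
```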