diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
index 87611ad..9696a93 100644
--- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
+++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
@@ -60,8 +60,7 @@ public void initialize(Configuration conf, Properties tbl)
       throws SerDeException {
-    jsonSerde.initialize(conf, tbl);
-    jsonSerde.setWriteablesUsage(false);
+    jsonSerde.initialize(conf, tbl, false);
     StructTypeInfo rowTypeInfo = jsonSerde.getTypeInfo();
     cachedObjectInspector =
         HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo);
@@ -82,10 +81,11 @@ public void initialize(Configuration conf, Properties tbl)
    * our own object implementation, and we use HCatRecord for it
    */
   @Override
+  @SuppressWarnings("unchecked")
   public Object deserialize(Writable blob) throws SerDeException {
     try {
-      Object row = jsonSerde.deserialize(blob);
-      List fatRow = fatLand((Object[]) row);
+      List<Object> row = (List<Object>) jsonSerde.deserialize(blob);
+      List<Object> fatRow = fatLand(row);
       return new DefaultHCatRecord(fatRow);
     } catch (Exception e) {
       throw new SerDeException(e);
@@ -93,19 +93,22 @@ public Object deserialize(Writable blob) throws SerDeException {
   }
 
   @SuppressWarnings({"rawtypes", "unchecked" })
-  private static List fatLand(Object[] arr) {
-    List ret = new ArrayList<>();
-    for (Object o : arr) {
-      if (o != null && o instanceof Map) {
+  private static List fatLand(final List arr) {
+    final List ret = new ArrayList<>();
+    for (final Object o : arr) {
+      if (o == null) {
+        ret.add(null);
+      } else if (o instanceof Map) {
         ret.add(fatMap(((Map) o)));
-      } else if (o != null && o instanceof List) {
-        ret.add(fatLand(((List) o).toArray()));
-      } else if (o != null && o.getClass().isArray() && o.getClass().getComponentType() != byte.class) {
+      } else if (o instanceof List) {
+        ret.add(fatLand((List) o));
+      } else if (o.getClass().isArray()
+          && o.getClass().getComponentType() != byte.class) {
         Class ct = o.getClass().getComponentType();
         if (ct.isPrimitive()) {
           ret.add(primitiveArrayToList(o));
         } else {
-          ret.add(fatLand((Object[]) o));
+          ret.add(Arrays.asList((Object[]) o));
         }
       } else {
         ret.add(o);
@@ -114,15 +117,14 @@ private static List fatLand(Object[] arr) {
     return ret;
   }
 
-  @SuppressWarnings("rawtypes")
   private static Object fatMap(Map<Object, Object> map) {
-    Map ret = new LinkedHashMap<>();
+    Map<Object, Object> ret = new LinkedHashMap<>();
     Set<Entry<Object, Object>> es = map.entrySet();
     for (Entry<Object, Object> e : es) {
-      Object oldV = e.getValue();
-      Object newV;
+      final Object oldV = e.getValue();
+      final Object newV;
       if (oldV != null && oldV.getClass().isArray()) {
-        newV = fatLand((Object[]) oldV);
+        newV = fatLand(Arrays.asList((Object[]) oldV));
       } else {
         newV = oldV;
       }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java
index f5814ed..1838383 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java
@@ -57,7 +57,7 @@ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumen
     try {
       TypeInfo t = TypeInfoUtils.getTypeInfoFromTypeString(typeStr);
       jsonReader = new HiveJsonStructReader(t);
-      jsonReader.setWritablesUsage(true);
+      jsonReader.setWriteablePrimitives(true);
     } catch (Exception e) {
       throw new UDFArgumentException(getFuncName() + ": Error parsing typestring: " +
          e.getMessage());
     }
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java
index f1c8477..80dc07f 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java
@@ -16,11 +16,13 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+
 package org.apache.hadoop.hive.serde2;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Base64;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
@@ -30,6 +32,7 @@
 import org.apache.hadoop.hive.common.type.Date;
 import org.apache.hadoop.hive.common.type.Timestamp;
 import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.json.BinaryEncoding;
 import org.apache.hadoop.hive.serde2.json.HiveJsonStructReader;
 import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
@@ -63,87 +66,130 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS,
-    serdeConstants.LIST_COLUMN_TYPES,
-    serdeConstants.TIMESTAMP_FORMATS })
-
+@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS,
+    serdeConstants.LIST_COLUMN_TYPES, serdeConstants.TIMESTAMP_FORMATS,
+    JsonSerDe.BINARY_FORMAT, JsonSerDe.IGNORE_EXTRA })
 public class JsonSerDe extends AbstractSerDe {
   private static final Logger LOG = LoggerFactory.getLogger(JsonSerDe.class);
+
+  public static final String BINARY_FORMAT = "json.binary.format";
+  public static final String IGNORE_EXTRA = "text.ignore.extra.fields";
+  public static final String NULL_EMPTY_LINES = "text.null.empty.line";
+
   private List<String> columnNames;
+
+  private BinaryEncoding binaryEncoding;
+  private boolean nullEmptyLines;
+
   private HiveJsonStructReader structReader;
   private StructTypeInfo rowTypeInfo;
 
   @Override
   public void initialize(Configuration conf, Properties tbl)
-    throws SerDeException {
-    List<TypeInfo> columnTypes;
+      throws SerDeException {
+    initialize(conf, tbl, true);
+  }
+
+  public void initialize(final Configuration conf, final Properties tbl,
+      final boolean writeablePrimitives) {
+    LOG.debug("Initializing JsonSerDe: {}", tbl.entrySet());
 
     // Get column names
-    String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
-    final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? tbl
-        .getProperty(serdeConstants.COLUMN_NAME_DELIMITER)
-        : String.valueOf(SerDeUtils.COMMA);
-    // all table column names
-    if (columnNameProperty.isEmpty()) {
-      columnNames = Collections.emptyList();
-    } else {
-      columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter));
-    }
+    final String columnNameProperty =
+        tbl.getProperty(serdeConstants.LIST_COLUMNS);
+    final String columnNameDelimiter = tbl.getProperty(
+        serdeConstants.COLUMN_NAME_DELIMITER, String.valueOf(SerDeUtils.COMMA));
+
+    this.columnNames = columnNameProperty.isEmpty() ?
+        Collections.emptyList()
+        : Arrays.asList(columnNameProperty.split(columnNameDelimiter));
 
     // all column types
-    String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
-    if (columnTypeProperty.isEmpty()) {
-      columnTypes = Collections.emptyList();
-    } else {
-      columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
-    }
+    final String columnTypeProperty =
+        tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
+
+    final List<TypeInfo> columnTypes =
+        columnTypeProperty.isEmpty() ? Collections.emptyList()
+            : TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
 
     LOG.debug("columns: {}, {}", columnNameProperty, columnNames);
     LOG.debug("types: {}, {} ", columnTypeProperty, columnTypes);
     assert (columnNames.size() == columnTypes.size());
 
-    rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
+    final String nullEmpty = tbl.getProperty(NULL_EMPTY_LINES, "false");
+    this.nullEmptyLines = Boolean.parseBoolean(nullEmpty);
+
+    this.rowTypeInfo = (StructTypeInfo) TypeInfoFactory
+        .getStructTypeInfo(columnNames, columnTypes);
+
+    final TimestampParser tsParser;
+    final String parserFormats =
+        tbl.getProperty(serdeConstants.TIMESTAMP_FORMATS);
+    if (parserFormats != null) {
+      tsParser =
+          new TimestampParser(HiveStringUtils.splitAndUnEscape(parserFormats));
+    } else {
+      tsParser = new TimestampParser();
+    }
 
-    TimestampParser tsParser = new TimestampParser(
-        HiveStringUtils.splitAndUnEscape(tbl.getProperty(serdeConstants.TIMESTAMP_FORMATS)));
-    structReader = new HiveJsonStructReader(rowTypeInfo, tsParser);
-    structReader.setIgnoreUnknownFields(true);
-    structReader.enableHiveColIndexParsing(true);
-    structReader.setWritablesUsage(true);
+    final String binaryEncodingStr = tbl.getProperty(BINARY_FORMAT, "base64");
+    this.binaryEncoding =
+        BinaryEncoding.valueOf(binaryEncodingStr.toUpperCase());
+
+    this.structReader = new HiveJsonStructReader(rowTypeInfo, tsParser);
+    this.structReader.enableHiveColIndexParsing(true);
+    this.structReader.setWriteablePrimitives(writeablePrimitives);
+
+    this.structReader.setBinaryEncoding(binaryEncoding);
+
+    final String ignoreExtras = tbl.getProperty(IGNORE_EXTRA, "false");
+    this.structReader
+        .setIgnoreUnknownFields(Boolean.parseBoolean(ignoreExtras));
+
+    LOG.debug("JSON Struct Reader: {}", structReader);
   }
 
   /**
-   * Takes JSON string in Text form, and has to return an object representation above
-   * it that's readable by the corresponding object inspector.
+   * Deserialize an object out of a Writable blob containing JSON text. The
+   * return value of this function will be constant since the function will
+   * reuse the returned object. If the client wants to keep a copy of the
+   * object, the client needs to clone the returned value by calling
+   * ObjectInspectorUtils.getStandardObject().
+   *
-   * For this implementation, since we're using the jackson parser, we can construct
-   * our own object implementation, and we use HCatRecord for it
+   * @param blob The Writable (Text) object containing a serialized object
+   * @return A List containing all the values of the row
    */
   @Override
   public Object deserialize(Writable blob) throws SerDeException {
+    final Text t = (Text) blob;
+
+    if (t.getLength() == 0) {
+      if (!this.nullEmptyLines) {
+        throw new SerDeException("Encountered an empty row in the text file");
+      }
+      final int fieldCount =
+          structReader.getObjectInspector().getAllStructFieldRefs().size();
+      return Collections.nCopies(fieldCount, null);
+    }
 
-    Object row;
-    Text t = (Text) blob;
     try {
-      row = structReader.parseStruct(new ByteArrayInputStream((t.getBytes()), 0, t.getLength()));
-      return row;
+      return structReader.parseStruct(
+          new ByteArrayInputStream((t.getBytes()), 0, t.getLength()));
     } catch (Exception e) {
-      LOG.warn("Error [{}] parsing json text [{}].", e, t);
+      LOG.warn("Error parsing JSON text [{}].", t, e);
       throw new SerDeException(e);
     }
   }
 
   /**
-   * Given an object and object inspector pair, traverse the object
-   * and generate a Text representation of the object.
+   * Given an object and object inspector pair, traverse the object and generate
+   * a Text representation of the object.
    */
   @Override
   public Writable serialize(Object obj, ObjectInspector objInspector)
-    throws SerDeException {
+      throws SerDeException {
     StringBuilder sb = new StringBuilder();
     try {
@@ -161,7 +207,7 @@ public Writable serialize(Object obj, ObjectInspector objInspector)
         appendWithQuotes(sb, columnNames.get(i));
         sb.append(SerDeUtils.COLON);
         buildJSONString(sb, soi.getStructFieldData(obj, structFields.get(i)),
-            structFields.get(i).getFieldObjectInspector());
+            structFields.get(i).getFieldObjectInspector());
       }
       sb.append(SerDeUtils.RBRACE);
     }
@@ -170,19 +216,30 @@ public Writable serialize(Object obj, ObjectInspector objInspector)
       LOG.warn("Error generating json text from object.", e);
       throw new SerDeException(e);
     }
-    return new Text(sb.toString());
+    final String jsonText = sb.toString();
+    LOG.debug("Serialized JSON text: {}", jsonText);
+    return new Text(jsonText);
   }
 
-  private static StringBuilder appendWithQuotes(StringBuilder sb, String value) {
-    return sb == null ? null : sb.append(SerDeUtils.QUOTE).append(value).append(SerDeUtils.QUOTE);
+  private static StringBuilder appendWithQuotes(StringBuilder sb,
+      String value) {
+    return sb == null ? null
+        : sb.append(SerDeUtils.QUOTE).append(value).append(SerDeUtils.QUOTE);
   }
-  // TODO : code section copied over from SerDeUtils because of non-standard json production there
-  // should use quotes for all field names. We should fix this there, and then remove this copy.
-  // See http://jackson.codehaus.org/1.7.3/javadoc/org/codehaus/jackson/JsonParser.Feature.html#ALLOW_UNQUOTED_FIELD_NAMES
-  // for details - trying to enable Jackson to ignore that doesn't seem to work(compilation failure
-  // when attempting to use that feature, so having to change the production itself.
+
+  // TODO: This code section was copied over from SerDeUtils because the JSON
+  // produced there is non-standard: field names should always be quoted. We
+  // should fix it there and then remove this copy.
+  // See
+  // http://jackson.codehaus.org/1.7.3/javadoc/org/codehaus/jackson/JsonParser.Feature.html#ALLOW_UNQUOTED_FIELD_NAMES
+  // for details. Trying to enable Jackson to ignore that doesn't seem to work
+  // (compilation failure when attempting to use that feature), so we have to
+  // change the JSON production itself.
   // Also, throws IOException when Binary is detected.
-  private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector oi) throws IOException {
+  private void buildJSONString(StringBuilder sb, Object o,
+      ObjectInspector oi) throws IOException {
 
     switch (oi.getCategory()) {
     case PRIMITIVE: {
@@ -193,7 +250,7 @@ private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector
       switch (poi.getPrimitiveCategory()) {
       case BOOLEAN: {
         boolean b = ((BooleanObjectInspector) poi).get(o);
-        sb.append(b ? "true" : "false");
+        sb.append(b);
         break;
       }
       case BYTE: {
@@ -221,53 +278,56 @@ private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector
         break;
       }
       case STRING: {
-        String s =
-            SerDeUtils.escapeString(((StringObjectInspector) poi).getPrimitiveJavaObject(o));
+        String s = SerDeUtils.escapeString(
+            ((StringObjectInspector) poi).getPrimitiveJavaObject(o));
         appendWithQuotes(sb, s);
         break;
       }
       case BINARY:
         byte[] b = ((BinaryObjectInspector) oi).getPrimitiveJavaObject(o);
-        Text txt = new Text();
-        txt.set(b, 0, b.length);
-        appendWithQuotes(sb, SerDeUtils.escapeString(txt.toString()));
+        Text binaryText = encodeBinary(b);
+        appendWithQuotes(sb, SerDeUtils.escapeString(binaryText.toString()));
         break;
       case DATE:
         Date d = ((DateObjectInspector) poi).getPrimitiveJavaObject(o);
         appendWithQuotes(sb, d.toString());
         break;
       case TIMESTAMP: {
-        Timestamp t = ((TimestampObjectInspector) poi).getPrimitiveJavaObject(o);
+        Timestamp t =
+            ((TimestampObjectInspector) poi).getPrimitiveJavaObject(o);
         appendWithQuotes(sb, t.toString());
         break;
       }
       case DECIMAL:
-        sb.append(((HiveDecimalObjectInspector) poi).getPrimitiveJavaObject(o));
+        sb.append(
+            ((HiveDecimalObjectInspector) poi).getPrimitiveJavaObject(o));
         break;
       case VARCHAR: {
-        String s = SerDeUtils.escapeString(
-            ((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject(o).toString());
+        String s = SerDeUtils.escapeString(((HiveVarcharObjectInspector) poi)
+            .getPrimitiveJavaObject(o).toString());
        appendWithQuotes(sb, s);
        break;
       }
       case CHAR: {
-        //this should use HiveChar.getPaddedValue() but it's protected; currently (v0.13)
+        // this should use HiveChar.getPaddedValue() but it's protected;
+        // currently (v0.13)
         // HiveChar.toString() returns getPaddedValue()
-        String s = SerDeUtils.escapeString(
-            ((HiveCharObjectInspector) poi).getPrimitiveJavaObject(o).toString());
+        String s = SerDeUtils.escapeString(((HiveCharObjectInspector) poi)
+            .getPrimitiveJavaObject(o).toString());
         appendWithQuotes(sb, s);
         break;
       }
       default:
-        throw new RuntimeException("Unknown primitive type: " + poi.getPrimitiveCategory());
+        throw new RuntimeException(
+            "Unknown primitive type: " + poi.getPrimitiveCategory());
       }
     }
       break;
     }
     case LIST: {
       ListObjectInspector loi = (ListObjectInspector) oi;
-      ObjectInspector listElementObjectInspector = loi
-          .getListElementObjectInspector();
+      ObjectInspector listElementObjectInspector =
+          loi.getListElementObjectInspector();
       List<?> olist = loi.getList(o);
       if (olist == null) {
         sb.append("null");
@@ -286,8 +346,8 @@ private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector
     case MAP: {
       MapObjectInspector moi = (MapObjectInspector) oi;
       ObjectInspector mapKeyObjectInspector = moi.getMapKeyObjectInspector();
-      ObjectInspector mapValueObjectInspector = moi
-          .getMapValueObjectInspector();
+      ObjectInspector mapValueObjectInspector =
+          moi.getMapValueObjectInspector();
       Map<?, ?> omap = moi.getMap(o);
       if (omap == null) {
         sb.append("null");
@@ -304,7 +364,8 @@ private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector
           StringBuilder keyBuilder = new StringBuilder();
           buildJSONString(keyBuilder, e.getKey(), mapKeyObjectInspector);
           String keyString = keyBuilder.toString().trim();
-          if ((!keyString.isEmpty()) && (keyString.charAt(0) != SerDeUtils.QUOTE)) {
+          if ((!keyString.isEmpty())
+              && (keyString.charAt(0) != SerDeUtils.QUOTE)) {
             appendWithQuotes(sb, keyString);
           } else {
             sb.append(keyString);
@@ -355,10 +416,22 @@ private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector
     }
   }
 
+  private Text encodeBinary(byte[] b) {
+    switch (this.binaryEncoding) {
+    case BASE64:
+      return new Text(Base64.getMimeEncoder().encodeToString(b));
+
+    case DEFAULT:
+    default:
+      Text txt = new Text();
+      txt.set(b, 0, b.length);
+      return txt;
+    }
+  }
 
   /**
-   * Returns an object inspector for the specified schema that
-   * is capable of reading in the object representation of the JSON string
+   * Returns an object inspector for the specified schema that is capable of
+   * reading in the object representation of the JSON string
    */
   @Override
   public ObjectInspector getObjectInspector() throws SerDeException {
@@ -380,8 +453,11 @@ public StructTypeInfo getTypeInfo() {
     return rowTypeInfo;
   }
 
-  public void setWriteablesUsage(boolean b) {
-    structReader.setWritablesUsage(b);
+  public BinaryEncoding getBinaryEncoding() {
+    return binaryEncoding;
   }
 
+  public boolean isNullEmptyLines() {
+    return nullEmptyLines;
+  }
 }
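
[Reviewer note, not part of the patch] A minimal sketch of how the rewritten
SerDe and its new table properties are expected to be wired up, modeled on the
unit tests added at the end of this patch. The column names, types and JSON
literal are invented for illustration:

    Properties props = new Properties();
    props.setProperty(serdeConstants.LIST_COLUMNS, "id,name");
    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,string");
    props.setProperty(JsonSerDe.IGNORE_EXTRA, "true");      // drop undeclared JSON fields
    props.setProperty(JsonSerDe.NULL_EMPTY_LINES, "true");  // empty line -> all-NULL row

    JsonSerDe serde = new JsonSerDe();
    // 'true' asks the reader to wrap primitives in Hadoop Writables
    serde.initialize(null, props, true);

    // "extra" is skipped because of IGNORE_EXTRA; an empty Text would yield a
    // row of nulls instead of a SerDeException because of NULL_EMPTY_LINES.
    Object row = serde.deserialize(
        new Text("{\"id\": 1, \"name\": \"alice\", \"extra\": true}"));
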
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/json/BinaryEncoding.java b/serde/src/java/org/apache/hadoop/hive/serde2/json/BinaryEncoding.java
new file mode 100644
index 0000000..d104862
--- /dev/null
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/json/BinaryEncoding.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.json;
+
+/**
+ * Enum describing the String-to-bytes encodings available for JSON parsing.
+ * The base-64 variant is what most people would think of as "the standard"
+ * Base64 encoding for JSON: the specific MIME content transfer encoding.
+ */
+public enum BinaryEncoding {
+  DEFAULT, BASE64
+}
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonStructReader.java b/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonStructReader.java
index ec4efad..ed0f8c0 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonStructReader.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonStructReader.java
@@ -20,12 +20,16 @@
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.charset.CharacterCodingException;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -38,8 +42,6 @@
 import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
@@ -53,350 +55,441 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonParseException;
-import com.fasterxml.jackson.core.JsonParser;
-import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.JsonNodeType;
+import com.fasterxml.jackson.databind.node.TextNode;
+import com.google.common.base.Preconditions;
 
+/**
+ * JavaScript Object Notation (JSON) objects group items of possibly different
+ * types into a single type, much like a classic 'struct'. Each JSON object is
+ * surrounded by curly braces {} and is presented as key/value pairs. An item
+ * within the JSON object may itself be a JSON object and therefore allows for
+ * nested fields. This class parses JSON objects and returns each item in the
+ * struct in a List of Objects.
+ */
 public class HiveJsonStructReader {
 
-  private static final Logger LOG = LoggerFactory.getLogger(HiveJsonStructReader.class);
+  private static final Logger LOG =
+      LoggerFactory.getLogger(HiveJsonStructReader.class);
 
-  private ObjectInspector oi;
-  private JsonFactory factory;
+  private final ObjectMapper objectMapper = new ObjectMapper();
+  private final Map<String, StructField> discoveredFields = new HashMap<>();
+  private final Set<String> discoveredUnknownFields = new HashSet<>();
 
-  Set<String> reportedUnknownFieldNames = new HashSet<>();
+  private final StructObjectInspector soi;
+  private TimestampParser tsParser;
 
-  private static boolean ignoreUnknownFields;
-  private static boolean hiveColIndexParsing;
+  private boolean ignoreUnknownFields;
+  private boolean hiveColIndexParsing;
+  private BinaryEncoding binaryEncoding;
   private boolean writeablePrimitives;
 
-  private TimestampParser tsParser;
-
-  public HiveJsonStructReader(TypeInfo t) {
-    this(t, new TimestampParser());
+  /**
+   * Constructor using the default Hive timestamp parser.
+   *
+   * @param typeInfo Type info for all the fields in the JSON object
+   */
+  public HiveJsonStructReader(TypeInfo typeInfo) {
+    this(typeInfo, new TimestampParser());
   }
 
-  public HiveJsonStructReader(TypeInfo t, TimestampParser tsParser) {
+  /**
+   * Constructor accepting a custom timestamp parser.
+   *
+   * @param typeInfo Type info for all the fields in the JSON object
+   * @param tsParser Custom timestamp parser
+   */
+  public HiveJsonStructReader(TypeInfo typeInfo, TimestampParser tsParser) {
+    this.ignoreUnknownFields = false;
+    this.hiveColIndexParsing = false;
+    this.writeablePrimitives = false;
+    this.binaryEncoding = BinaryEncoding.BASE64;
     this.tsParser = tsParser;
-    oi = TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(t);
-    factory = new JsonFactory();
+    this.soi = (StructObjectInspector) TypeInfoUtils
+        .getStandardWritableObjectInspectorFromTypeInfo(typeInfo);
   }
 
-  public Object parseStruct(String text) throws JsonParseException, IOException, SerDeException {
-    JsonParser parser = factory.createParser(text);
-    return parseInternal(parser);
+  /**
+   * Parse text containing a complete JSON object.
+   *
+   * @param text The text to parse
+   * @return A List of Objects, one for each field in the JSON object
+   * @throws IOException Unable to parse the JSON text
+   * @throws SerDeException The SerDe is not configured correctly
+   */
+  @SuppressWarnings("unchecked")
+  public List<Object> parseStruct(final String text)
+      throws IOException, SerDeException {
+    Preconditions.checkNotNull(text);
+    Preconditions.checkState(this.soi != null);
+    final JsonNode rootNode = this.objectMapper.readTree(text);
+    return (List<Object>) visitNode(rootNode, this.soi);
   }
 
-  public Object parseStruct(InputStream is) throws JsonParseException, IOException, SerDeException {
-    JsonParser parser = factory.createParser(is);
-    return parseInternal(parser);
+  /**
+   * Parse an InputStream containing a complete JSON object.
+   *
+   * @param in The InputStream to read the text from
+   * @return A List of Objects, one for each field in the JSON object
+   * @throws IOException Unable to parse the JSON text
+   * @throws SerDeException The SerDe is not configured correctly
+   */
+  @SuppressWarnings("unchecked")
+  public List<Object> parseStruct(final InputStream in)
+      throws IOException, SerDeException {
+    Preconditions.checkNotNull(in);
+    Preconditions.checkState(this.soi != null);
+    final JsonNode rootNode = this.objectMapper.readTree(in);
+    return (List<Object>) visitNode(rootNode, this.soi);
   }
 
-  private Object parseInternal(JsonParser parser) throws SerDeException {
-    try {
-      parser.nextToken();
-      Object res = parseDispatcher(parser, oi);
-      return res;
-    } catch (Exception e) {
-      String locationStr = parser.getCurrentLocation().getLineNr() + "," + parser.getCurrentLocation().getColumnNr();
-      throw new SerDeException("at[" + locationStr + "]: " + e.getMessage(), e);
+  /**
+   * Visit a node and parse it based on the provided ObjectInspector.
+   *
+   * @param rootNode The root node to process
+   * @param oi The ObjectInspector to use
+   * @return The value in this node (may be a complex type if nested)
+   * @throws SerDeException The SerDe is not configured correctly
+   */
+  private Object visitNode(final JsonNode rootNode, final ObjectInspector oi)
+      throws SerDeException {
+
+    if (rootNode.isNull()) {
+      return null;
     }
-  }
 
-  private Object parseDispatcher(JsonParser parser, ObjectInspector oi)
-      throws JsonParseException, IOException, SerDeException {
     switch (oi.getCategory()) {
     case PRIMITIVE:
-      return parsePrimitive(parser, (PrimitiveObjectInspector) oi);
+      final Object value = visitLeafNode(rootNode, oi);
+      return optionallyWrapWritable(value, oi);
     case LIST:
-      return parseList(parser, (ListObjectInspector) oi);
+      return visitArrayNode(rootNode, oi);
     case STRUCT:
-      return parseStruct(parser, (StructObjectInspector) oi);
+      return visitStructNode(rootNode, oi);
     case MAP:
-      return parseMap(parser, (MapObjectInspector) oi);
+      return visitMapNode(rootNode, oi);
     default:
-      throw new SerDeException("parsing of: " + oi.getCategory() + " is not handled");
+      throw new SerDeException(
+          "Parsing of: " + oi.getCategory() + " is not supported");
     }
   }
 
-  private Object parseMap(JsonParser parser, MapObjectInspector oi) throws IOException, SerDeException {
-
-    if (parser.getCurrentToken() == JsonToken.VALUE_NULL) {
-      parser.nextToken();
-      return null;
+  /**
+   * The typical usage of this SerDe requires that it return Hadoop Writable
+   * objects. However, some uses of this SerDe want the return values to be
+   * Java primitive objects. The reader works internally with Java primitive
+   * objects and wraps them in Writable containers only if required.
+   *
+   * @param value The Java primitive object to wrap
+   * @param oi The ObjectInspector that provides the type to wrap into
+   * @return A Hadoop Writable if required; otherwise the object itself
+   */
+  private Object optionallyWrapWritable(final Object value,
+      final ObjectInspector oi) {
+    if (!writeablePrimitives) {
+      return value;
     }
 
-    Map ret = new LinkedHashMap<>();
+    final PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
+    final PrimitiveTypeInfo typeInfo = poi.getTypeInfo();
 
-    if (parser.getCurrentToken() != JsonToken.START_OBJECT) {
-      throw new SerDeException("struct expected");
-    }
+    return PrimitiveObjectInspectorFactory
+        .getPrimitiveJavaObjectInspector(typeInfo.getPrimitiveCategory())
+        .getPrimitiveWritableObject(value);
+  }
 
-    if (!(oi.getMapKeyObjectInspector() instanceof PrimitiveObjectInspector)) {
-      throw new SerDeException("map key must be a primitive");
+  /**
+   * Visit a node if it is expected to be a Map (a.k.a. JSON Object).
+   *
+   * @param rootNode The node pointing at the JSON object
+   * @param oi The ObjectInspector to parse the Map (must be a
+   *          MapObjectInspector)
+   * @return A Java Map containing the contents of the JSON map
+   * @throws SerDeException The SerDe is not configured correctly
+   */
+  private Map<Object, Object> visitMapNode(final JsonNode rootNode,
+      final ObjectInspector oi) throws SerDeException {
+    Preconditions.checkArgument(JsonNodeType.OBJECT == rootNode.getNodeType());
+
+    final Map<Object, Object> ret = new LinkedHashMap<>();
+
+    final ObjectInspector mapKeyInspector =
+        ((MapObjectInspector) oi).getMapKeyObjectInspector();
+
+    final ObjectInspector mapValueInspector =
+        ((MapObjectInspector) oi).getMapValueObjectInspector();
+
+    if (!(mapKeyInspector instanceof PrimitiveObjectInspector)) {
+      throw new SerDeException("Map key must be a primitive type");
     }
-    PrimitiveObjectInspector keyOI = (PrimitiveObjectInspector) oi.getMapKeyObjectInspector();
-    ObjectInspector valOI = oi.getMapValueObjectInspector();
 
-    JsonToken currentToken = parser.nextToken();
-    while (currentToken != null && currentToken != JsonToken.END_OBJECT) {
-
-      if (currentToken != JsonToken.FIELD_NAME) {
-        throw new SerDeException("unexpected token: " + currentToken);
-      }
-
-      Object key = parseMapKey(parser, keyOI);
-      Object val = parseDispatcher(parser, valOI);
+    final Iterator<Entry<String, JsonNode>> it = rootNode.fields();
+    while (it.hasNext()) {
+      final Entry<String, JsonNode> field = it.next();
+      final Object key =
+          visitNode(new TextNode(field.getKey()), mapKeyInspector);
+      final Object val = visitNode(field.getValue(), mapValueInspector);
       ret.put(key, val);
-
-      currentToken = parser.getCurrentToken();
     }
 
-    if (currentToken != null) {
-      parser.nextToken();
-    }
-
-    return ret;
-  }
-
-  private Object parseStruct(JsonParser parser, StructObjectInspector oi)
-      throws JsonParseException, IOException, SerDeException {
-
-    Object[] ret = new Object[oi.getAllStructFieldRefs().size()];
-
-    if (parser.getCurrentToken() == JsonToken.VALUE_NULL) {
-      parser.nextToken();
-      return null;
-    }
-    if (parser.getCurrentToken() != JsonToken.START_OBJECT) {
-      throw new SerDeException("struct expected");
-    }
-    JsonToken currentToken = parser.nextToken();
-    while (currentToken != null && currentToken != JsonToken.END_OBJECT) {
-
-      switch (currentToken) {
-      case FIELD_NAME:
-        String name = parser.getCurrentName();
-        try {
-          StructField field = null;
-          try {
-            field = getStructField(oi, name);
-          } catch (RuntimeException e) {
-            if (ignoreUnknownFields) {
-              if (!reportedUnknownFieldNames.contains(name)) {
-                LOG.warn("ignoring field:" + name);
-                reportedUnknownFieldNames.add(name);
-              }
-              parser.nextToken();
-              skipValue(parser);
-              break;
-            }
-          }
-          if (field == null) {
-            throw new SerDeException("undeclared field");
-          }
-          parser.nextToken();
-          ret[field.getFieldID()] = parseDispatcher(parser, field.getFieldObjectInspector());
-        } catch (Exception e) {
-          throw new SerDeException("struct field " + name + ": " + e.getMessage(), e);
-        }
-        break;
-      default:
-        throw new SerDeException("unexpected token: " + currentToken);
-      }
-      currentToken = parser.getCurrentToken();
-    }
-    if (currentToken != null) {
-      parser.nextToken();
-    }
 
     return ret;
   }
 
-  private StructField getStructField(StructObjectInspector oi, String name) {
-    if (hiveColIndexParsing) {
-      int colIndex = getColIndex(name);
-      if (colIndex >= 0) {
-        return oi.getAllStructFieldRefs().get(colIndex);
-      }
-    }
-    // FIXME: linear scan inside the below method...get a map here or something..
-    return oi.getStructFieldRef(name);
-  }
-
-  Pattern internalPattern = Pattern.compile("^_col([0-9]+)$");
-
-  private int getColIndex(String internalName) {
-    // The above line should have been all the implementation that
-    // we need, but due to a bug in that impl which recognizes
-    // only single-digit columns, we need another impl here.
-    Matcher m = internalPattern.matcher(internalName);
-    if (!m.matches()) {
-      return -1;
-    } else {
-      return Integer.parseInt(m.group(1));
-    }
-  }
-
-  private static void skipValue(JsonParser parser) throws JsonParseException, IOException {
-
-    int array = 0;
-    int object = 0;
-    do {
-      JsonToken currentToken = parser.getCurrentToken();
-      if(currentToken == JsonToken.START_ARRAY) {
-        array++;
-      }
-      if (currentToken == JsonToken.END_ARRAY) {
-        array--;
-      }
-      if (currentToken == JsonToken.START_OBJECT) {
-        object++;
-      }
-      if (currentToken == JsonToken.END_OBJECT) {
-        object--;
-      }
-
-      parser.nextToken();
-
-    } while (array > 0 || object > 0);
-
-  }
-
-  private Object parseList(JsonParser parser, ListObjectInspector oi)
-      throws JsonParseException, IOException, SerDeException {
-    List ret = new ArrayList<>();
-
-    if (parser.getCurrentToken() == JsonToken.VALUE_NULL) {
-      parser.nextToken();
-      return null;
-    }
-
-    if (parser.getCurrentToken() != JsonToken.START_ARRAY) {
-      throw new SerDeException("array expected");
-    }
-    ObjectInspector eOI = oi.getListElementObjectInspector();
-    JsonToken currentToken = parser.nextToken();
-    try {
-      while (currentToken != null && currentToken != JsonToken.END_ARRAY) {
-        ret.add(parseDispatcher(parser, eOI));
-        currentToken = parser.getCurrentToken();
+  /**
+   * Visit a node if it is expected to be a Struct data type (a.k.a. JSON
+   * Object).
+   *
+   * @param rootNode The node pointing at the JSON object
+   * @param oi The ObjectInspector to parse the struct (must be a
+   *          StructObjectInspector)
+   * @return A List of Objects, one for each field of the struct
+   * @throws SerDeException The SerDe is not configured correctly
+   */
+  private List<Object> visitStructNode(final JsonNode rootNode,
+      final ObjectInspector oi) throws SerDeException {
+
+    Preconditions.checkArgument(JsonNodeType.OBJECT == rootNode.getNodeType());
+
+    final StructObjectInspector structInspector = (StructObjectInspector) oi;
+
+    final int fieldCount = structInspector.getAllStructFieldRefs().size();
+    final List<Object> ret = Arrays.asList(new Object[fieldCount]);
+
+    final Iterator<Entry<String, JsonNode>> it = rootNode.fields();
+    while (it.hasNext()) {
+      final Entry<String, JsonNode> field = it.next();
+      final String fieldName = field.getKey();
+      final JsonNode childNode = field.getValue();
+      final StructField structField =
+          getStructField(structInspector, fieldName);
+
+      // If the struct field is null it is because there is a field defined in
+      // the JSON object that was not defined in the table definition. Ignore.
+      if (structField != null) {
+        final Object childValue =
+            visitNode(childNode, structField.getFieldObjectInspector());
+        ret.set(structField.getFieldID(), childValue);
       }
-    } catch (Exception e) {
-      throw new SerDeException("array: " + e.getMessage(), e);
     }
-    currentToken = parser.nextToken();
 
     return ret;
   }
 
-  private Object parsePrimitive(JsonParser parser, PrimitiveObjectInspector oi)
-      throws SerDeException, IOException {
-    JsonToken currentToken = parser.getCurrentToken();
-    if (currentToken == null) {
-      return null;
+  /**
+   * Visit a node if it is expected to be a JSON Array data type (a.k.a. Hive
+   * Array type).
+   *
+   * @param rootNode The node pointing at the JSON object
+   * @param oi The ObjectInspector to parse the List (must be a
+   *          ListObjectInspector)
+   * @return A Java List of Objects, each element is an element of the array
+   * @throws SerDeException The SerDe is not configured correctly
+   */
+  private List<Object> visitArrayNode(final JsonNode rootNode,
+      final ObjectInspector oi) throws SerDeException {
+    Preconditions.checkArgument(JsonNodeType.ARRAY == rootNode.getNodeType());
+
+    final ObjectInspector loi =
+        ((ListObjectInspector) oi).getListElementObjectInspector();
+
+    final List<Object> ret = new ArrayList<>(rootNode.size());
+    final Iterator<JsonNode> it = rootNode.elements();
+
+    while (it.hasNext()) {
+      final JsonNode element = it.next();
+      ret.add(visitNode(element, loi));
     }
 
-    try {
-      switch (parser.getCurrentToken()) {
-      case VALUE_FALSE:
-      case VALUE_TRUE:
-      case VALUE_NUMBER_INT:
-      case VALUE_NUMBER_FLOAT:
-      case VALUE_STRING:
-        return getObjectOfCorrespondingPrimitiveType(parser.getValueAsString(), oi);
-      case VALUE_NULL:
-        return null;
-      default:
-        throw new SerDeException("unexpected token type: " + currentToken);
-      }
-    } finally {
-      parser.nextToken();
-    }
+    return ret;
   }
 
-  private Object getObjectOfCorrespondingPrimitiveType(String s, PrimitiveObjectInspector oi)
-      throws IOException {
-    PrimitiveTypeInfo typeInfo = oi.getTypeInfo();
-    if (writeablePrimitives) {
-      Converter c = ObjectInspectorConverters.getConverter(PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi);
-      return c.convert(s);
-    }
+  /**
+   * Visit a node if it is expected to be a primitive value (JSON leaf node).
+   *
+   * @param leafNode The node pointing at the JSON value
+   * @param oi The ObjectInspector to parse the value (must be a
+   *          PrimitiveObjectInspector)
+   * @return A Java primitive Object
+   * @throws SerDeException The SerDe is not configured correctly
+   */
+  private Object visitLeafNode(final JsonNode leafNode,
+      final ObjectInspector oi) throws SerDeException {
+
+    final PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
+    final PrimitiveTypeInfo typeInfo = poi.getTypeInfo();
 
     switch (typeInfo.getPrimitiveCategory()) {
     case INT:
-      return Integer.valueOf(s);
+      return Integer.valueOf(leafNode.asInt());
     case BYTE:
-      return Byte.valueOf(s);
+      return Byte.valueOf((byte) leafNode.asInt());
     case SHORT:
-      return Short.valueOf(s);
+      return Short.valueOf((short) leafNode.asInt());
     case LONG:
-      return Long.valueOf(s);
+      return Long.valueOf(leafNode.asLong());
     case BOOLEAN:
-      return (s.equalsIgnoreCase("true"));
+      return Boolean.valueOf(leafNode.asBoolean());
     case FLOAT:
-      return Float.valueOf(s);
+      return Float.valueOf((float) leafNode.asDouble());
    case DOUBLE:
-      return Double.valueOf(s);
+      return Double.valueOf(leafNode.asDouble());
     case STRING:
-      return s;
+      return leafNode.asText();
     case BINARY:
-      try {
-        String t = Text.decode(s.getBytes(), 0, s.getBytes().length);
-        return t.getBytes();
-      } catch (CharacterCodingException e) {
-        LOG.warn("Error generating json binary type from object.", e);
-        return null;
-      }
+      return getByteValue(leafNode);
     case DATE:
-      return Date.valueOf(s);
+      return Date.valueOf(leafNode.asText());
     case TIMESTAMP:
-      return tsParser.parseTimestamp(s);
+      return tsParser.parseTimestamp(leafNode.asText());
     case DECIMAL:
-      return HiveDecimal.create(s);
+      return HiveDecimal.create(leafNode.asText());
     case VARCHAR:
-      return new HiveVarchar(s, ((BaseCharTypeInfo) typeInfo).getLength());
+      return new HiveVarchar(leafNode.asText(),
+          ((BaseCharTypeInfo) typeInfo).getLength());
     case CHAR:
-      return new HiveChar(s, ((BaseCharTypeInfo) typeInfo).getLength());
+      return new HiveChar(leafNode.asText(),
+          ((BaseCharTypeInfo) typeInfo).getLength());
+    default:
+      throw new SerDeException(
+          "Could not convert from string to type " + typeInfo.getTypeName());
     }
-    throw new IOException("Could not convert from string to map type " + typeInfo.getTypeName());
   }
 
-  private Object parseMapKey(JsonParser parser, PrimitiveObjectInspector oi) throws SerDeException, IOException {
-    JsonToken currentToken = parser.getCurrentToken();
-    if (currentToken == null) {
-      return null;
-    }
+  /**
+   * A user may configure the encoding for binary data represented as text
+   * within a JSON object. This method applies that encoding to the text.
+   *
+   * @param binaryNode JSON Node containing the binary data
+   * @return A byte array with the binary data
+   * @throws SerDeException The SerDe is not configured correctly
+   */
+  private byte[] getByteValue(final JsonNode binaryNode) throws SerDeException {
     try {
-      switch (parser.getCurrentToken()) {
-      case FIELD_NAME:
-        return getObjectOfCorrespondingPrimitiveType(parser.getValueAsString(), oi);
-      case VALUE_NULL:
-        return null;
-      default:
-        throw new SerDeException("unexpected token type: " + currentToken);
+      switch (this.binaryEncoding) {
+      case DEFAULT:
+        final String byteText = binaryNode.textValue();
+        return Text.decode(byteText.getBytes(), 0, byteText.getBytes().length)
+            .getBytes(StandardCharsets.UTF_8);
+      case BASE64:
+        return binaryNode.binaryValue();
       }
-    } finally {
-      parser.nextToken();
+    } catch (Exception e) {
+      throw new SerDeException("Error generating JSON binary type from record.",
+          e);
     }
+    throw new SerDeException("Error generating JSON binary type from record.");
+  }
+
+  /**
+   * Matches the JSON object's field name with the Hive data type.
+   *
+   * @param oi The ObjectInspector to look the field up in
+   * @param fieldName The name of the field parsed from the JSON text
+   * @return The metadata regarding this field
+   * @throws SerDeException The SerDe is not configured correctly
+   */
+  private StructField getStructField(final StructObjectInspector oi,
+      final String fieldName) throws SerDeException {
+
+    // Ignore the field if it has been ignored before
+    if (this.discoveredUnknownFields.contains(fieldName)) {
+      return null;
+    }
+
+    // Return from cache if the field has already been discovered
+    StructField structField = this.discoveredFields.get(fieldName);
+    if (structField != null) {
+      return structField;
+    }
+
+    // Otherwise attempt to discover the field
+    if (hiveColIndexParsing) {
+      int colIndex = getColIndex(fieldName);
+      if (colIndex >= 0) {
+        structField = oi.getAllStructFieldRefs().get(colIndex);
+      }
+    }
+    if (structField == null) {
+      try {
+        structField = oi.getStructFieldRef(fieldName);
+      } catch (Exception e) {
+        // No such field
+      }
+    }
+
+    if (structField != null) {
+      // cache it for next time
+      this.discoveredFields.put(fieldName, structField);
+    } else {
+      // Tried everything and did not discover this field
+      if (this.ignoreUnknownFields) {
+        if (this.discoveredUnknownFields.add(fieldName)) {
+          LOG.warn("Discovered unknown field: {}. Ignoring.", fieldName);
+        }
+      } else {
+        throw new SerDeException(
+            "Field found in JSON does not match table definition: "
+                + fieldName);
+      }
+    }
+
+    return structField;
   }
 
-  public void setIgnoreUnknownFields(boolean b) {
-    ignoreUnknownFields = b;
+  private final Pattern internalPattern = Pattern.compile("^_col([0-9]+)$");
+
+  /**
+   * Look up a column's index based on its internal name.
+   *
+   * @param internalName The name of the column
+   * @return The index of the field, or -1 if the field name does not encode
+   *         its index number
+   */
+  private int getColIndex(final String internalName) {
+    // The regex above should have been all the implementation that
+    // we need, but due to a bug in that impl which recognizes
+    // only single-digit columns, we need another impl here.
+    final Matcher m = internalPattern.matcher(internalName);
+    return m.matches() ?
+        Integer.valueOf(m.group(1)) : -1;
+  }
+
+  public void setIgnoreUnknownFields(boolean ignore) {
+    this.ignoreUnknownFields = ignore;
   }
 
-  public void enableHiveColIndexParsing(boolean b) {
-    hiveColIndexParsing = b;
+  public void enableHiveColIndexParsing(boolean indexing) {
+    hiveColIndexParsing = indexing;
   }
 
-  public void setWritablesUsage(boolean b) {
-    writeablePrimitives = b;
+  public StructObjectInspector getObjectInspector() {
+    return soi;
   }
 
-  public ObjectInspector getObjectInspector() {
-    return oi;
+  public BinaryEncoding getBinaryEncodingType() {
+    return binaryEncoding;
   }
+
+  public void setBinaryEncoding(BinaryEncoding encoding) {
+    this.binaryEncoding = encoding;
+  }
+
+  public boolean isWriteablePrimitives() {
+    return writeablePrimitives;
+  }
+
+  public void setWriteablePrimitives(boolean writeablePrimitives) {
+    this.writeablePrimitives = writeablePrimitives;
+  }
+
+  @Override
+  public String toString() {
+    return "HiveJsonStructReader [ignoreUnknownFields=" + ignoreUnknownFields
+        + ", hiveColIndexParsing=" + hiveColIndexParsing
+        + ", binaryEncodingType=" + binaryEncoding + ", writeablePrimitives="
+        + writeablePrimitives + "]";
+  }
+
 }
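
[Reviewer note, not part of the patch] The rewritten reader can also be driven
directly, as GenericUDFJsonRead does above. A hedged sketch; the type string
and JSON literal are invented, and exception handling is elided:

    TypeInfo t = TypeInfoUtils.getTypeInfoFromTypeString("struct<a:int,b:string>");
    HiveJsonStructReader reader = new HiveJsonStructReader(t);
    reader.setWriteablePrimitives(true);  // IntWritable/Text instead of Integer/String
    reader.setBinaryEncoding(BinaryEncoding.BASE64);

    // One call per JSON document; fields come back in declaration order.
    List<Object> row = reader.parseStruct("{\"a\": 1, \"b\": \"x\"}");
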
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java b/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java
new file mode 100644
index 0000000..6061563
--- /dev/null
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hadoop.hive.serde2;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Test suite for the JSON SerDe class.
+ */
+public class TestJsonSerDe {
+
+  @Test
+  @SuppressWarnings("unchecked")
+  public void testPrimitiveDataTypes() throws Exception {
+    Properties props = new Properties();
+    props.setProperty(serdeConstants.LIST_COLUMNS,
+        "name,height,weight,endangered,born");
+    props.setProperty(serdeConstants.LIST_COLUMN_TYPES,
+        "string,float,int,boolean,timestamp");
+    props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis");
+
+    JsonSerDe serde = new JsonSerDe();
+    serde.initialize(null, props, false);
+
+    final String jsonText = loadJson("simple.json");
+
+    final Text text = new Text(jsonText);
+    final List<Object> results = (List<Object>) serde.deserialize(text);
+
+    Assert.assertNotNull(results);
+    Assert.assertEquals(5, results.size());
+    Assert.assertEquals("giraffe", results.get(0));
+    Assert.assertEquals(5.5f, results.get(1));
+    Assert.assertEquals(1360, results.get(2));
+    Assert.assertEquals(true, results.get(3));
+    Assert.assertEquals(Timestamp.ofEpochMilli(1549751270013L), results.get(4));
+  }
+
+  @Test
+  @SuppressWarnings("unchecked")
+  public void testArray() throws Exception {
+    Properties props = new Properties();
+    props.setProperty(serdeConstants.LIST_COLUMNS, "list,items");
+    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string,array<string>");
+    props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis");
+
+    JsonSerDe serde = new JsonSerDe();
+    serde.initialize(null, props, false);
+
+    final String jsonText = loadJson("array.json");
+
+    final Text text = new Text(jsonText);
+    final List<Object> results = (List<Object>) serde.deserialize(text);
+
+    Assert.assertNotNull(results);
+    Assert.assertEquals(2, results.size());
+    Assert.assertEquals("grocery", results.get(0));
+    Assert.assertEquals(Arrays.asList("milk", "eggs", "bread"), results.get(1));
+  }
+
+  /**
+   * Test when a map has a key defined as a numeric value. Technically, JSON
+   * does not support this because each key in a map must be a quoted string.
+   * Unquoted strings (and hence int keys) are allowed by JavaScript, but not
+   * by the JSON specification. For Hive, the int map key is stored as a
+   * string and must be converted back into an int type.
+   */
+  @Test
+  @SuppressWarnings("unchecked")
+  public void testMapNumericKey() throws Exception {
+    Properties props = new Properties();
+    props.setProperty(serdeConstants.LIST_COLUMNS, "map");
+    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "map<int,string>");
+    props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis");
+
+    JsonSerDe serde = new JsonSerDe();
+    serde.initialize(null, props, false);
+
+    final String jsonText = loadJson("map_int_key.json");
+
+    final Text text = new Text(jsonText);
+    final List<Object> results = (List<Object>) serde.deserialize(text);
+
+    Assert.assertNotNull(results);
+    Assert.assertEquals(1, results.size());
+
+    Map<Integer, String> resultMap = (Map<Integer, String>) results.get(0);
+    String value1 = resultMap.get(1);
+    String value2 = resultMap.get(2);
+
+    Assert.assertEquals(2, resultMap.size());
+    Assert.assertEquals("2001-01-01", value1);
+    Assert.assertEquals(null, value2);
+  }
+
+  @Test
+  @SuppressWarnings("unchecked")
+  public void testBlankLineAllowed() throws Exception {
+    Properties props = new Properties();
+    props.setProperty(serdeConstants.LIST_COLUMNS, "a,b,c");
+    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,int,int");
+    props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis");
+
+    // This is the test parameter
+    props.setProperty(JsonSerDe.NULL_EMPTY_LINES, "true");
+
+    JsonSerDe serde = new JsonSerDe();
+    serde.initialize(null, props, false);
+
+    final Text text = new Text("");
+    final List<Object> results = (List<Object>) serde.deserialize(text);
+    Assert.assertEquals(Arrays.asList(null, null, null), results);
+  }
+
+  @Test(expected = SerDeException.class)
+  public void testBlankLineException() throws Exception {
+    Properties props = new Properties();
+    props.setProperty(serdeConstants.LIST_COLUMNS, "a,b,c");
+    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,int,int");
+    props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "millis");
+
+    // This is the test parameter
+    props.setProperty(JsonSerDe.NULL_EMPTY_LINES, "false");
+
+    JsonSerDe serde = new JsonSerDe();
+    serde.initialize(null, props, false);
+
+    serde.deserialize(new Text(""));
+  }
+
+  /**
+   * Accepts a file name and loads it from src/test/resources/json.
+   *
+   * @param resourceName The name of the file to load
+   * @return A JSON string, all whitespace removed
+   * @throws IOException Failed to load the file
+   */
+  private String loadJson(final String resourceName) throws IOException {
+    final String path = "/json/" + resourceName;
+    final URL url = this.getClass().getResource(path);
+    final File testJson = new File(url.getFile());
+    final String jsonText =
+        FileUtils.readFileToString(testJson, StandardCharsets.UTF_8);
+    return jsonText.replaceAll("\\s+", "");
+  }
+}
diff --git a/serde/src/test/resources/json/array.json b/serde/src/test/resources/json/array.json
new file mode 100644
index 0000000..f0b2ac7
--- /dev/null
+++ b/serde/src/test/resources/json/array.json
@@ -0,0 +1,8 @@
+{
+  "list": "grocery",
+  "items": [
+    "milk",
+    "eggs",
+    "bread"
+  ]
+}
\ No newline at end of file
diff --git a/serde/src/test/resources/json/map_int_key.json b/serde/src/test/resources/json/map_int_key.json
new file mode 100644
index 0000000..786b6e4
--- /dev/null
+++ b/serde/src/test/resources/json/map_int_key.json
@@ -0,0 +1,6 @@
+{
+  "map": {
+    "1": "2001-01-01",
+    "2": null
+  }
+}
\ No newline at end of file
diff --git a/serde/src/test/resources/json/simple.json b/serde/src/test/resources/json/simple.json
new file mode 100644
index 0000000..c35d847
--- /dev/null
+++ b/serde/src/test/resources/json/simple.json
@@ -0,0 +1,7 @@
+{
"giraffe", + "height": 5.5, + "weight": 1360, + "endangered": true, + "born": 1549751270013 +} \ No newline at end of file