diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java index bc4e8d8085..7a6afa4d82 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java @@ -34,7 +34,10 @@ * AbstractEncodingAwareSerDe aware the encoding from table properties, * transform data from specified charset to UTF-8 during serialize, and * transform data from UTF-8 to specified charset during deserialize. + * + * @see org.apache.hadoop.hive.serde2.text.AbstractEncodingAwareSerDe */ +@Deprecated public abstract class AbstractEncodingAwareSerDe extends AbstractSerDe { private static final Logger LOG = LoggerFactory.getLogger(AbstractEncodingAwareSerDe.class); protected Charset charset; diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractDelimitedTextSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractDelimitedTextSerDe.java new file mode 100644 index 0000000000..c39b7e86a5 --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractDelimitedTextSerDe.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; +import java.util.Objects; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; + +/** + * A base class for any Hive serializer or deserializer (SerDe) which supports a + * text-based format that stores tabular data as a unicode string and separates + * values in each row with one or more delimiter characters. + */ +public abstract class AbstractDelimitedTextSerDe + extends AbstractEncodingAwareSerDe { + + public static final String STRICT_FIELDS_COUNT = "text.strict.fields.count"; + + private String delim; + private Splitter stringSplitter; + private Joiner stringJoiner; + + private final EnumSet features = EnumSet.noneOf(Feature.class); + + /** + * Enumeration that defines all on/off features for this SerDe. 
+ * + */ + public enum Feature { + /** + * If this feature is enabled, and more fields are parsed from a row than is + * defined in the schema, an exception is raised. Otherwise, the data is + * silently ignored, + */ + CHECK_FIELD_COUNT + } + + @Override + public void initialize(final Configuration configuration, + final Properties tableProperties) throws SerDeException { + Objects.requireNonNull(this.delim, + "Delimiter must be set before initializing"); + + LOG.debug("Table delimiter: {}", this.delim); + + super.initialize(configuration, tableProperties); + + this.stringSplitter = + Splitter.on(delim).trimResults().limit(getColumnNames().size()); + this.stringJoiner = Joiner.on(delim); + + if (Boolean + .valueOf(tableProperties.getProperty(STRICT_FIELDS_COUNT, "false"))) { + features.add(Feature.CHECK_FIELD_COUNT); + } + } + + /** + * Serialize a string into a list of one or more fields. The fields are + * separated by an arbitrary character sequence. + * + * @param clob The unicode string to deserialize + * @return The object which was parsed from the String + * @throws SerDeException If the object cannot be serialized + */ + @Override + protected List doDeserialize(final String clob) + throws SerDeException { + Objects.requireNonNull(this.stringSplitter, + "Deserializer is not initialized"); + + LOG.trace("Splitting row on delimiter [{}]:[{}]", this.delim, clob); + + final List fields = new ArrayList<>(getColumnNames().size()); + + if (!clob.isEmpty()) { + Iterables.addAll(fields, this.stringSplitter.split(clob)); + } + + final int fieldCountDelta = getColumnNames().size() - fields.size(); + + if (this.features.contains(Feature.CHECK_FIELD_COUNT)) { + if (fieldCountDelta > 0) { + throw new SerDeException( + "Number of fields parsed from text data is less than number of " + + "columns defined in the table schema"); + } + if (Iterables.getLast(fields).contains(this.delim)) { + throw new SerDeException( + "Number of fields parsed from text data is more than number of " + + "columns defined in the table schema"); + } + } + + if (fieldCountDelta > 0) { + fields.addAll(Collections.nCopies(fieldCountDelta, null)); + } + + return Collections.unmodifiableList(fields); + } + + /** + * Given a row of data, generate a UTF-8 encoded String which contains all of + * the fields in the row separated by a delimiter. 
+ * + * @param obj The object to serialize + * @param objInspector The ObjectInspector to reference to navigate the object + * to serialize + * @return The object serialized into a unicode String + * @throws SerDeException If the object cannot be serialized + */ + @Override + protected String doSerialize(final Object obj, + final ObjectInspector objInspector) throws SerDeException { + Objects.requireNonNull(this.stringJoiner, "Serializer is not initialized"); + + final StructObjectInspector outputRowOI = + (StructObjectInspector) objInspector; + + final List outputFieldRefs = + outputRowOI.getAllStructFieldRefs(); + + if (outputFieldRefs.size() != getColumnNames().size()) { + throw new SerDeException("Cannot serialize the object because there are " + + outputFieldRefs.size() + " fields but the table has " + + getColumnNames().size() + " columns."); + } + + final List row = new ArrayList<>(outputFieldRefs.size()); + + for (int c = 0; c < outputFieldRefs.size(); c++) { + final Object field = + outputRowOI.getStructFieldData(obj, outputFieldRefs.get(c)); + final ObjectInspector fieldOI = + outputFieldRefs.get(c).getFieldObjectInspector(); + + // The data must be of type String + final StringObjectInspector fieldStringOI = + (StringObjectInspector) fieldOI; + + // Convert the field to Java class String, because objects of String type + // can be stored in String, Text, or some other classes. + row.add(fieldStringOI.getPrimitiveJavaObject(field)); + } + + return this.stringJoiner.join(row); + } + + /** + * Set the delimiter to be used to separate the fields in each row. The + * delimiter must be set before calling the + * {@link #initialize(Configuration, Properties)} method. + * + * @param delim Delimiter used to separate each field + */ + public void setDelim(final String delim) { + this.delim = delim; + } + + /** + * Get the currently configured delimiter used to separate fields in each row. + * + * @return The currently configured delimiter + */ + public String getDelim() { + return delim; + } + +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractEncodingAwareSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractEncodingAwareSerDe.java new file mode 100644 index 0000000000..117325d0db --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractEncodingAwareSerDe.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.serde2.text; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +/** + * AbstractEncodingAwareSerDe aware the encoding from table properties, + * transform data from specified charset to UTF-8 during serialize, and + * transform data from UTF-8 to specified charset during deserialize. + */ +public abstract class AbstractEncodingAwareSerDe extends AbstractTextSerDe { + + protected Charset charset; + + @Override + public void initialize(final Configuration configuration, + final Properties tableProperties) throws SerDeException { + super.initialize(configuration, tableProperties); + this.charset = Charset.forName(tableProperties.getProperty( + serdeConstants.SERIALIZATION_ENCODING, StandardCharsets.UTF_8.name())); + if (this.charset.equals(StandardCharsets.ISO_8859_1) + || this.charset.equals(StandardCharsets.US_ASCII)) { + LOG.warn("The data may not be properly converted to target charset " + + charset); + } + } + + /** + * Deserialize an object out of a Writable blob. This class assumes that the + * data is stored in a text format, so the blob must be of type {@link Text}. + * In most cases, the return value of this function will be constant since the + * function will reuse the returned object. If the client wants to keep a copy + * of the object, the client needs to clone the returned value by calling + * ObjectInspectorUtils.getStandardObject(). + * + * This class can be initialized with the character matching the source data. + * This method will convert the provided {@link Text} to UTF-8 from the + * specified character set before being processed. + * + * @param blob The Writable object containing a serialized object + * @return A Java object representing the contents in the blob. + * @throws SerDeException If the object cannot be deserialized + */ + @Override + public Object deserialize(final Writable blob) throws SerDeException { + final Text text = (Text) blob; + final Writable clob = (StandardCharsets.UTF_8.equals(this.charset)) ? text + : transformToUTF8(text); + return super.deserialize(clob); + } + + /** + * Serialize an object by navigating inside the Object with the + * ObjectInspector. In most cases, the return value of this function will be + * constant since the function will reuse the Writable object. If the client + * wants to keep a copy of the Writable, the client needs to clone the + * returned value. + * + * This class can be initialized with the character matching the source data. + * This method will convert the {@link Text} provided by subclasses from UTF-8 + * to the specified character set of the data. + * + * @param obj The object to serialize + * @param objInspector The ObjectInspector to reference to navigate the object + * to serialize + * @return The object serialized into a single {@link Text} object + * @throws SerDeException If the object cannot be serialized + */ + @Override + public Writable serialize(Object obj, ObjectInspector objInspector) + throws SerDeException { + final Text text = (Text) super.serialize(obj, objInspector); + final Writable result = (StandardCharsets.UTF_8.equals(this.charset)) ? 
text + : transformFromUTF8(text, this.charset); + return result; + } + + /** + * Transform Text data from UTF-8 to another charset. + * + * @param text The unicode text to encode + * @param cs The target character set of the text + * @return The text encoded in the specified character set + */ + private Text transformFromUTF8(final Text text, final Charset cs) { + return new Text(new String(text.getBytes(), 0, text.getLength(), cs)); + } + + /** + * Transform Writable data from its native charset to UTF-8. + * + * @param text The text to encode + * @return The text encoded as UTF-8 + */ + private Text transformToUTF8(final Text text) { + return new Text(new String(text.getBytes(), 0, text.getLength(), + StandardCharsets.UTF_8)); + } +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractRegexTextSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractRegexTextSerDe.java new file mode 100644 index 0000000000..938ff33b1b --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractRegexTextSerDe.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.Properties; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; + +/** + * A base class Hive Serializer-Deserializer (SerDe) that supports regular + * expression (regex) for parsing text files. The regular expression must + * specify one or more capturing groups. If a line of text is processed that + * generates a number of capture groups that is different than the number of + * columns defined in the table, a {@link SerDeException} is thrown.
+ *
+ * This SerDe does not support serializing table data into rows of text and will + * throw a {@link SerDeException} if it is used in that way. + * + * @see Pattern + * @see Matcher + */ +public abstract class AbstractRegexTextSerDe + extends AbstractEncodingAwareSerDe { + + private Pattern pattern; + + @Override + public void initialize(final Configuration configuration, + final Properties tableProperties) throws SerDeException { + Objects.requireNonNull(this.pattern); + super.initialize(configuration, tableProperties); + } + + @Override + protected String doSerialize(final Object obj, + final ObjectInspector objInspector) throws SerDeException { + throw new UnsupportedOperationException( + getClass() + " does not support serialization"); + } + + /** + * Based on the provided regex, deserialize a UTF-8 encoded string into a list + * of strings. + * + * @param clob The unicode string to deserialize + * @return A list of strings pulled from the value provided + * @throws SerDeException If the number of capture groups generated by + * applying the regex to the clob value provided does not equal the + * number of columns defined in the Hive table + */ + @Override + protected List doDeserialize(final String clob) + throws SerDeException { + + LOG.trace("Splitting with regex [{}]:[{}]", this.pattern, clob); + + final int columnCount = getColumnNames().size(); + + final List fields = new ArrayList<>(columnCount); + + if (!clob.isEmpty()) { + final Matcher matcher = this.pattern.matcher(clob); + if (matcher.matches()) { + for (int i = 1; i <= matcher.groupCount(); i++) { + final String group = matcher.group(i); + LOG.trace("Found a match: {}]", group); + fields.add(group); + } + } + } + + final int fieldCountDelta = getColumnNames().size() - fields.size(); + + LOG.debug("Expected number of fields: {} [delta:{}]", columnCount, + fieldCountDelta); + + if (fieldCountDelta > 0) { + throw new SerDeException( + "Number of fields parsed from text data is less than number of " + + "columns defined in the table schema"); + } + + if (fieldCountDelta < 0) { + throw new SerDeException( + "Number of fields parsed from text data is more than number of " + + "columns defined in the table schema. Data would be lost."); + } + + return Collections.unmodifiableList(fields); + } + + public Pattern getPattern() { + return pattern; + } + + public void setPattern(Pattern pattern) { + this.pattern = pattern; + } +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractTextSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractTextSerDe.java new file mode 100644 index 0000000000..649fb1c6c4 --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/text/AbstractTextSerDe.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text; + +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; +import java.util.Objects; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A base class for any Hive serializer or deserializer (SerDe) which supports a + * text-based format. + */ +public abstract class AbstractTextSerDe extends AbstractSerDe { + + public static final String IGNORE_EMPTY_LINES = "text.ignore.empty.line"; + + protected final Logger LOG = LoggerFactory.getLogger(getClass()); + + private List columnNames = Collections.emptyList(); + + private ObjectInspector objectInspector; + + private final Text cachedWritable = new Text(); + + private final EnumSet features = EnumSet.noneOf(Feature.class); + + /** + * Enumeration that defines all on/off features for this SerDe. + *
+ * <ul>
+ * <li>{@link #ERROR_ON_BLANK_LINE}</li>
+ * </ul>
+ */ + public enum Feature { + /** + * If this feature is enabled, an exception is raised if the serializer or + * deserializer encounters an empty line in the text file. + */ + ERROR_ON_BLANK_LINE + } + + @Override + public void initialize(final Configuration configuration, + final Properties tableProperties) throws SerDeException { + + LOG.debug("Table Configuration: {}", configuration); + LOG.debug("Table Properties: {}", tableProperties); + + this.columnNames = Collections.unmodifiableList(Arrays.asList( + tableProperties.getProperty(serdeConstants.LIST_COLUMNS).split(","))); + + final List objectInspectors = Collections + .unmodifiableList(Collections.nCopies(this.columnNames.size(), + PrimitiveObjectInspectorFactory.javaStringObjectInspector)); + + this.objectInspector = ObjectInspectorFactory + .getStandardStructObjectInspector(this.columnNames, objectInspectors); + + if (!Boolean + .valueOf(tableProperties.getProperty(IGNORE_EMPTY_LINES, "true"))) { + features.add(Feature.ERROR_ON_BLANK_LINE); + } + } + + /** + * Deserialize an object out of a Writable blob. This class assumes that the + * data is stored in a text format, so the blob must be of type {@link Text}. + * In most cases, the return value of this function will be constant since the + * function will reuse the returned object. If the client wants to keep a copy + * of the object, the client needs to clone the returned value by calling + * ObjectInspectorUtils.getStandardObject(). + * + * @param blob The Writable object containing a serialized object + * @return A Java object representing the contents in the blob. + * @throws SerDeException If the object cannot be deserialized + */ + @Override + public Object deserialize(final Writable blob) throws SerDeException { + Objects.requireNonNull(blob, "Cannot deserialize null BLOB"); + + final Text rowText = (Text) blob; + + LOG.trace("Row data: [{}]", rowText); + + if (features.contains(Feature.ERROR_ON_BLANK_LINE) + && rowText.getLength() == 0) { + throw new SerDeException("Text data contains blank line"); + } + + return doDeserialize(rowText.toString()); + } + + /** + * Serialize an object by navigating inside the Object with the + * ObjectInspector. In most cases, the return value of this function will be + * constant since the function will reuse the Writable object. If the client + * wants to keep a copy of the Writable, the client needs to clone the + * returned value. + * + * @param obj The object to serialize + * @param objInspector The ObjectInspector to reference to navigate the object + * to serialize + * @return The object serialized into a single {@link Text} object + * @throws SerDeException If the object cannot be serialized + */ + @Override + public Writable serialize(final Object obj, + final ObjectInspector objInspector) throws SerDeException { + Objects.requireNonNull(obj, "Cannot serialize null object"); + Objects.requireNonNull(objInspector, "objInspector cannot be null"); + cachedWritable.set(doSerialize(obj, objInspector)); + return cachedWritable; + } + + /** + * Subclasses can override this method to allow for special serialization. The + * returned String is always expected to be UTF-8 encoded. 
+ * + * @param obj The object to serialize + * @param objInspector The ObjectInspector to reference to navigate the object + * to serialize + * @return The object serialized into a unicode String + * @throws SerDeException If the object cannot be serialized + */ + protected abstract String doSerialize(Object obj, + ObjectInspector objInspector) throws SerDeException; + + /** + * Subclasses can override this method to allow for special deserialization. + * The String argument is always expected to be UTF-8 encoded. + * + * @param clob The unicode string to deserialize + * @return The object which was parsed from the String + * @throws SerDeException If the object cannot be serialized + */ + protected abstract Object doDeserialize(String clob) throws SerDeException; + + /** + * Get the names of the columns defined in this table. + * + * @return A list of column names + */ + public List getColumnNames() { + return this.columnNames; + } + + @Override + public ObjectInspector getObjectInspector() throws SerDeException { + return this.objectInspector; + } + + /** + * The serialized type is always {@link Text} for any sub-classes. + */ + @Override + public Class getSerializedClass() { + return Text.class; + } + + /** + * There are no stats for most text-base data, but sub-classes may wish to + * override this method with their own implementation. + */ + @Override + public SerDeStats getSerDeStats() { + return null; + } + +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/text/CsvDelimitedTextSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/text/CsvDelimitedTextSerDe.java new file mode 100644 index 0000000000..bd79815f35 --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/text/CsvDelimitedTextSerDe.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text; + +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeSpec; + +/** + * A Hive Serializer-Deserializer (SerDe) that supports comma-separated values + * (CSV) files. + * + * A CSV file is a simple, widely supported, text format for storing data in a + * tabular structure. Each record in the table is one line of a text file. Each + * field value of a record is separated from the next by a comma character. + * + * This SerDe can be configured to handle different error scenarios. + *
+ * <ul>
+ * <li>Blank line in the text file</li>
+ * <li>Too many fields in the record</li>
+ * <li>Too few fields in the record</li>
+ * </ul>
+ *
+ * The default behavior of this class is to be tolerant of such issues to avoid
+ * query failures. Issues are handled in a reasonable way. All data is presented
+ * to the user even if it is invalid:
+ *
+ *
+ * <ul>
+ * <li>If a blank line is encountered, all fields are set to a null value</li>
+ * <li>If there are too many fields, the extra fields are appended to the final
+ * value</li>
+ * <li>If there are too few fields, all remaining fields are set to a null
+ * value</li>
+ * </ul>
+ *
+ * Examples:
+ *
+ * <pre>
+ * "1,2,3"   = ["1","2","3"]
+ * "1,2,"    = ["1","2",null]
+ * ""        = [null,null,null]
+ * "1,2,3,4" = ["1","2","3,4"]
+ * </pre>
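+ *
+ * A minimal deserialization sketch, mirroring the unit tests included in this
+ * patch (the column names are illustrative only):
+ *
+ * <pre>{@code
+ * Properties props = new Properties();
+ * props.setProperty(serdeConstants.LIST_COLUMNS,
+ *     "name,height,weight,endangered,born");
+ *
+ * CsvDelimitedTextSerDe serde = new CsvDelimitedTextSerDe();
+ * serde.initialize(null, props);
+ *
+ * List<String> row = (List<String>) serde.deserialize(
+ *     new Text("giraffe,3.21,20,false,2014-10-14 12:34:56.789"));
+ * // row => ["giraffe", "3.21", "20", "false", "2014-10-14 12:34:56.789"]
+ * }</pre>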
+ * + * Several configurations exist to cause an exception to be raised if one of + * these situations is encountered. + * + * @see CsvDelimitedTextSerDe + * @see AbstractDelimitedTextSerDe + */ +@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS, + serdeConstants.SERIALIZATION_ENCODING, AbstractTextSerDe.IGNORE_EMPTY_LINES, + AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT }) +public class CsvDelimitedTextSerDe extends AbstractDelimitedTextSerDe { + + private static final String DELIM = ","; + + @Override + public void initialize(final Configuration configuration, + final Properties tableProperties) throws SerDeException { + super.setDelim(DELIM); + super.initialize(configuration, tableProperties); + } + +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/text/DelimitedTextSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/text/DelimitedTextSerDe.java new file mode 100644 index 0000000000..1a4300eedd --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/text/DelimitedTextSerDe.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text; + +import java.util.Properties; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeSpec; + +/** + * A Hive Serializer-Deserializer (SerDe) that supports delimited-separated + * values (DSV) files. The delimited used is supplied by the user with the table + * definition. + * + * A DSV file is a simple, widely supported, text format for storing data in a + * tabular structure. Each record in the table is one line of a text file. Each + * field value of a record is separated from the next by an arbitrary string of + * characters. + * + * This SerDe can be configured to handle different error scenarios. + *
+ * <ul>
+ * <li>Blank line in the text file</li>
+ * <li>Too many fields in the record</li>
+ * <li>Too few fields in the record</li>
+ * </ul>
+ *
+ * The default behavior of this class is to be tolerant of such issues to avoid
+ * query failures. Issues are handled in a reasonable way. All data is presented
+ * to the user even if it is invalid:
+ *
+ *
+ * <ul>
+ * <li>If a blank line is encountered, all fields are set to a null value</li>
+ * <li>If there are too many fields, the extra fields are appended to the final
+ * value</li>
+ * <li>If there are too few fields, all remaining fields are set to a null
+ * value</li>
+ * </ul>
+ *
+ * Examples:
+ *
+ * <pre>
+ * "1,2,3"   = ["1","2","3"]
+ * "1,2,"    = ["1","2",null]
+ * ""        = [null,null,null]
+ * "1,2,3,4" = ["1","2","3,4"]
+ * </pre>
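+ *
+ * A minimal usage sketch with a custom delimiter, mirroring the unit tests
+ * included in this patch (the pipe delimiter and column names are illustrative
+ * only):
+ *
+ * <pre>{@code
+ * Properties props = new Properties();
+ * props.setProperty(serdeConstants.LIST_COLUMNS,
+ *     "name,height,weight,endangered,born");
+ * props.setProperty(serdeConstants.FIELD_DELIM, "|");
+ *
+ * DelimitedTextSerDe serde = new DelimitedTextSerDe();
+ * serde.initialize(null, props);
+ *
+ * List<String> row = (List<String>) serde.deserialize(
+ *     new Text("giraffe|3.21|20|false|2014-10-14 12:34:56.789"));
+ * }</pre>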
+ * + * Several configurations exist to cause an exception to be raised if one of + * these situations is encountered. + * + * @see CsvDelimitedTextSerDe + * @see AbstractDelimitedTextSerDe + */ +@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS, + serdeConstants.SERIALIZATION_ENCODING, serdeConstants.FIELD_DELIM, + AbstractTextSerDe.IGNORE_EMPTY_LINES, + AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT }) +public class DelimitedTextSerDe extends AbstractDelimitedTextSerDe { + + @Override + public void initialize(Configuration configuration, + Properties tableProperties) throws SerDeException { + final String delim = + tableProperties.getProperty(serdeConstants.FIELD_DELIM); + + if (StringUtils.isBlank(delim)) { + throw new SerDeException("Value cannot be blank for configuration " + + serdeConstants.FIELD_DELIM); + } + + super.setDelim(delim); + super.initialize(configuration, tableProperties); + } + +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/text/RegexTextSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/text/RegexTextSerDe.java new file mode 100644 index 0000000000..7fcb11ec13 --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/text/RegexTextSerDe.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text; + +import java.util.EnumSet; +import java.util.Properties; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeSpec; + +import com.google.common.base.Preconditions; + +@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS, + serdeConstants.SERIALIZATION_ENCODING, AbstractTextSerDe.IGNORE_EMPTY_LINES, + RegexTextSerDe.INPUT_REGEX, RegexTextSerDe.INPUT_REGEX_CASE_SENSITIVE }) +public class RegexTextSerDe extends AbstractRegexTextSerDe { + + public static final String INPUT_REGEX = "input.regex"; + public static final String INPUT_REGEX_CASE_SENSITIVE = + "input.regex.case.insensitive"; + public static final String STRICT_FIELDS_COUNT = "text.strict.fields.count"; + + private Pattern pattern; + + private final EnumSet features = EnumSet.noneOf(Feature.class); + + /** + * Enumeration that defines all on/off features for this SerDe. + *
+ * <ul>
+ * <li>{@link #CASE_INSENSITIVE_REGEX}</li>
+ * </ul>
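+ *
+ * A configuration sketch, based on the unit tests included in this patch; the
+ * regex and column list are illustrative, and the case-insensitive property is
+ * shown here only to demonstrate how the feature is enabled:
+ *
+ * <pre>{@code
+ * Properties props = new Properties();
+ * props.setProperty(serdeConstants.LIST_COLUMNS, "c1,c2");
+ * props.setProperty(RegexTextSerDe.INPUT_REGEX, "(.*),(.*)");
+ * // "true" enables Feature.CASE_INSENSITIVE_REGEX
+ * props.setProperty(RegexTextSerDe.INPUT_REGEX_CASE_SENSITIVE, "true");
+ *
+ * RegexTextSerDe serde = new RegexTextSerDe();
+ * serde.initialize(null, props);
+ * }</pre>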
+ */ + public enum Feature { + /** + * If this feature is enable, text will be considered by the regex with + * respect for case. + */ + CASE_INSENSITIVE_REGEX + } + + @Override + public void initialize(final Configuration configuration, + final Properties tableProperties) throws SerDeException { + + if (Boolean.valueOf( + tableProperties.getProperty(INPUT_REGEX_CASE_SENSITIVE, "false"))) { + features.add(Feature.CASE_INSENSITIVE_REGEX); + } + + final String regex = tableProperties.getProperty(INPUT_REGEX, ""); + Preconditions.checkArgument(!regex.isEmpty()); + + LOG.debug("SerDe configured with regex: {}", regex); + + this.pattern = features.contains(Feature.CASE_INSENSITIVE_REGEX) + ? Pattern.compile(regex, Pattern.CASE_INSENSITIVE) + : Pattern.compile(regex); + + super.setPattern(this.pattern); + super.initialize(configuration, tableProperties); + } +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/text/TsvDelimitedTextSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/text/TsvDelimitedTextSerDe.java new file mode 100644 index 0000000000..1ab9841eb0 --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/text/TsvDelimitedTextSerDe.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text; + +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeSpec; + +/** + * A Hive Serializer-Deserializer (SerDe) that supports tab-separated values + * (TSV) files. + * + * A TSV file is a simple, widely supported, text format for storing data in a + * tabular structure. Each record in the table is one line of a text file. Each + * field value of a record is separated from the next by a tab character. + * + * This SerDe can be configured to handle different error scenarios. + *
+ * <ul>
+ * <li>Blank line in the text file</li>
+ * <li>Too many fields in the record</li>
+ * <li>Too few fields in the record</li>
+ * </ul>
+ *
+ * The default behavior of this class is to be tolerant of such issues to avoid
+ * query failures. Issues are handled in a reasonable way. All data is presented
+ * to the user even if it is invalid:
+ *
+ *
+ * <ul>
+ * <li>If a blank line is encountered, all fields are set to a null value</li>
+ * <li>If there are too many fields, the extra fields are appended to the final
+ * value</li>
+ * <li>If there are too few fields, all remaining fields are set to a null
+ * value</li>
+ * </ul>
+ *
+ * Examples:
+ *
+ * <pre>
+ * "1,2,3"   = ["1","2","3"]
+ * "1,2,"    = ["1","2",null]
+ * ""        = [null,null,null]
+ * "1,2,3,4" = ["1","2","3,4"]
+ * </pre>
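+ *
+ * A sketch of a stricter configuration using the properties defined in this
+ * patch (column names are illustrative only):
+ *
+ * <pre>{@code
+ * Properties props = new Properties();
+ * props.setProperty(serdeConstants.LIST_COLUMNS, "c1,c2,c3");
+ * // Raise a SerDeException when the field count does not match the schema.
+ * props.setProperty(AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT, "true");
+ * // Raise a SerDeException on blank lines instead of returning all-null rows.
+ * props.setProperty(AbstractTextSerDe.IGNORE_EMPTY_LINES, "false");
+ *
+ * TsvDelimitedTextSerDe serde = new TsvDelimitedTextSerDe();
+ * serde.initialize(null, props);
+ * }</pre>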
+ * + * Several configurations exist to cause an exception to be raised if one of + * these situations is encountered. + * + * @see CsvDelimitedTextSerDe + * @see AbstractDelimitedTextSerDe + */ +@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS, + serdeConstants.SERIALIZATION_ENCODING, AbstractTextSerDe.IGNORE_EMPTY_LINES, + AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT }) +public class TsvDelimitedTextSerDe extends AbstractDelimitedTextSerDe { + + private static final String DELIM = "\t"; + + @Override + public void initialize(final Configuration configuration, + final Properties tableProperties) throws SerDeException { + super.setDelim(DELIM); + super.initialize(configuration, tableProperties); + } + +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/text/log/CombinedLogFormatSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/text/log/CombinedLogFormatSerDe.java new file mode 100644 index 0000000000..e4811a3e3e --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/text/log/CombinedLogFormatSerDe.java @@ -0,0 +1,99 @@ +package org.apache.hadoop.hive.serde2.text.log; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Properties; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeSpec; +import org.apache.hadoop.hive.serde2.text.AbstractRegexTextSerDe; +import org.apache.hadoop.hive.serde2.text.AbstractTextSerDe; + +/** + * The Combined Log Log Format is a an ASCII format, available for Web sites but + * not for FTP sites, and is the default format for Apache HTTP Server. This + * format is exactly the same as the Common Log Format, with the addition of two + * more fields. + * + * The log file entries produced will look something like this: + * + *
+ * 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
+ * 
+ * + * The additional fields are: + *
+ * <ol>
+ * <li>The "Referer" (sic) HTTP request header. This gives the site that the
+ * client reports having been referred from.</li>
+ * <li>The User-Agent HTTP request header. This is the identifying information
+ * that the client browser reports about itself.</li>
+ * </ol>
+ *
+ * Notes:
+ * <ul>
+ * <li>A "hyphen" in the output indicates that the requested piece of
+ * information is not available and will be represented as a null value.</li>
+ * <li>All fields are terminated with a space.</li>
+ * <li>The Hive table definition must map precisely to this format or a
+ * {@link SerDeException} is thrown.</li>
+ * </ul>
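+ *
+ * A usage sketch with a nine-column schema (the column names below are
+ * hypothetical; only the column count is enforced by this class):
+ *
+ * <pre>{@code
+ * Properties props = new Properties();
+ * props.setProperty(serdeConstants.LIST_COLUMNS,
+ *     "host,identity,user,time,request,status,size,referer,agent");
+ *
+ * CombinedLogFormatSerDe serde = new CombinedLogFormatSerDe();
+ * serde.initialize(null, props);
+ *
+ * // The "-" identity field in the sample entry above is returned as null.
+ * List<String> row = (List<String>) serde.deserialize(new Text(
+ *     "127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] \"GET /pb.gif HTTP/1.0\" 200 2326 "
+ *         + "\"http://www.example.com/start.html\" \"Mozilla/4.08 [en] (Win98; I ;Nav)\""));
+ * }</pre>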
+ * + * @see https://httpd.apache.org/docs/current/logs.html#accesslog + * @see CommonLogFormatSerDe + */ +@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS, + serdeConstants.SERIALIZATION_ENCODING, + AbstractTextSerDe.IGNORE_EMPTY_LINES }) +public class CombinedLogFormatSerDe extends AbstractRegexTextSerDe { + + private final static String LOG_REGEX = + "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.*)\" " + + "(\\d{3}) (\\S+) \"(\\S+)\" \"(.*)\"$"; + + private static final int EXPECTED_COL_COUNT = 9; + + @Override + public void initialize(final Configuration configuration, + final Properties tableProperties) throws SerDeException { + final Pattern pattern = Pattern.compile(LOG_REGEX); + super.setPattern(pattern); + super.initialize(configuration, tableProperties); + if (EXPECTED_COL_COUNT != getColumnNames().size()) { + throw new SerDeException( + "Schema must have 9 columns defined to match log format."); + } + } + + /** + * In the Combined Log Format, a 'null' value is represented with a dash "-". + * Use the regex to capture the field values and replace any dashes with null + * values. + * + * @param clob The unicode string to deserialize + * @return A list of strings pulled from the value provided + * @throws SerDeException If the number of capture groups generated by + * applying the regex to the clob value provided does not equal the + * number of columns defined in the Hive table + */ + @Override + protected List doDeserialize(final String clob) + throws SerDeException { + final List fields = super.doDeserialize(clob); + final List nulledFields = new ArrayList<>(fields.size()); + for (final String field : fields) { + final String newValue = ("-".equals(field)) ? null : field; + nulledFields.add(newValue); + } + return Collections.unmodifiableList(nulledFields); + } + +} \ No newline at end of file diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/text/log/CommonLogFormatSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/text/log/CommonLogFormatSerDe.java new file mode 100644 index 0000000000..88b549a688 --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/text/log/CommonLogFormatSerDe.java @@ -0,0 +1,104 @@ +package org.apache.hadoop.hive.serde2.text.log; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Properties; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeSpec; +import org.apache.hadoop.hive.serde2.text.AbstractRegexTextSerDe; +import org.apache.hadoop.hive.serde2.text.AbstractTextSerDe; + +/** + * The Common Log Format is a an ASCII format, available for Web sites but not + * for FTP sites, and is the default format for Apache HTTP Server. + * + * The log file entries produced will look something like this: + * + *
+ * 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /pb.gif HTTP/1.0" 200 2326
+ * 
+ * + * The format of each log entry is as follows: + *
+ * <ol>
+ * <li>The IP address or domain name of the user accessing the site.</li>
+ * <li>The RFC 1413 identity of the client determined by identd on the client's
+ * machine. This information is highly unreliable and should almost never be
+ * used except on tightly controlled internal networks.</li>
+ * <li>The userid of the person requesting the document as determined by HTTP
+ * authentication.</li>
+ * <li>The time that the request was received.</li>
+ * <li>The request line from the client, given in double quotes.</li>
+ * <li>The status code that the server sends back to the client. The full list
+ * of possible status codes can be found in the HTTP specification (RFC 2616,
+ * section 10).</li>
+ * <li>The size of the object returned to the client, not including the
+ * response headers. If no content was returned to the client, this value will
+ * be "-".</li>
+ * </ol>
+ *
+ * Notes:
+ * <ul>
+ * <li>A "hyphen" in the output indicates that the requested piece of
+ * information is not available and will be represented as a null value.</li>
+ * <li>All fields are terminated with a space.</li>
+ * <li>The Hive table definition must map precisely to this format or a
+ * {@link SerDeException} is thrown.</li>
+ * </ul>
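+ *
+ * For example, the sample entry above (with the required seven-column schema)
+ * deserializes with the "-" identity field replaced by null:
+ *
+ * <pre>
+ * 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /pb.gif HTTP/1.0" 200 2326
+ *   => ["127.0.0.1", null, "frank", "10/Oct/2000:13:55:36 -0700",
+ *       "GET /pb.gif HTTP/1.0", "200", "2326"]
+ * </pre>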
+ * + * @see https://httpd.apache.org/docs/current/logs.html#accesslog + */ +@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS, + serdeConstants.SERIALIZATION_ENCODING, + AbstractTextSerDe.IGNORE_EMPTY_LINES }) +public class CommonLogFormatSerDe extends AbstractRegexTextSerDe { + + private final static String LOG_REGEX = "^(\\S+) (\\S+) (\\S+) " + + "\\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.*)\" (\\d{3}) (\\S+)$"; + + private static final int EXPECTED_COL_COUNT = 7; + + @Override + public void initialize(final Configuration configuration, + final Properties tableProperties) throws SerDeException { + final Pattern pattern = Pattern.compile(LOG_REGEX); + super.setPattern(pattern); + super.initialize(configuration, tableProperties); + if (EXPECTED_COL_COUNT != getColumnNames().size()) { + throw new SerDeException( + "Schema must have 7 columns defined to match log format."); + } + } + + /** + * In the Common Log Format, a 'null' value is represented with a dash "-". + * Use the regex to capture the field values and replace any dashes with null + * values. + * + * @param clob The unicode string to deserialize + * @return A list of strings pulled from the value provided + * @throws SerDeException If the number of capture groups generated by + * applying the regex to the clob value provided does not equal the + * number of columns defined in the Hive table + */ + @Override + protected List doDeserialize(final String clob) + throws SerDeException { + final List fields = super.doDeserialize(clob); + final List nulledFields = new ArrayList<>(fields.size()); + for (final String field : fields) { + final String newValue = ("-".equals(field)) ? null : field; + nulledFields.add(newValue); + } + return Collections.unmodifiableList(nulledFields); + } +} diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/text/TestCsvDelimitedTextSerDe.java b/serde/src/test/org/apache/hadoop/hive/serde2/text/TestCsvDelimitedTextSerDe.java new file mode 100644 index 0000000000..e51d3e3380 --- /dev/null +++ b/serde/src/test/org/apache/hadoop/hive/serde2/text/TestCsvDelimitedTextSerDe.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.serde2.text; + +import java.sql.Timestamp; +import java.time.Instant; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; +import java.util.TimeZone; + +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * Test suite for {@link CsvDelimitedTextSerDe}. + */ +public class TestCsvDelimitedTextSerDe { + + private CsvDelimitedTextSerDe serde; + + @Before + public void setup() { + this.serde = new CsvDelimitedTextSerDe(); + } + + @Test + public void testSerialize() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + + this.serde.initialize(null, props); + + ObjectInspector oi = serde.getObjectInspector(); + + Object testStructure = Arrays.asList(new Text("giraffe"), + new DoubleWritable(3.21), new IntWritable(20), + new BooleanWritable(false), new TimestampWritable( + Timestamp.from(Instant.parse("2014-10-14T12:34:56.02Z")))); + + final TimeZone previousTimeZone = TimeZone.getDefault(); + TimeZone.setDefault(TimeZone.getTimeZone("UTC")); + try { + final String result = this.serde.doSerialize(testStructure, oi); + Assert.assertEquals("giraffe,3.21,20,false,2014-10-14 12:34:56.02", + result); + } finally { + TimeZone.setDefault(previousTimeZone); + } + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeDefault() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + + this.serde.initialize(null, props); + + final Text testText = + new Text("giraffe,3.21,20,false,2014-10-14 12:34:56.789"); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertEquals("giraffe", result.get(0)); + Assert.assertEquals("3.21", result.get(1)); + Assert.assertEquals("20", result.get(2)); + Assert.assertEquals("false", result.get(3)); + Assert.assertEquals("2014-10-14 12:34:56.789", result.get(4)); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeExtraField() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + + this.serde.initialize(null, props); + + final Text testText = + new Text("giraffe,3.21,20,false,2014-10-14 12:34:56.789,XXX"); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertEquals("giraffe", result.get(0)); + Assert.assertEquals("3.21", result.get(1)); + Assert.assertEquals("20", result.get(2)); + Assert.assertEquals("false", result.get(3)); + Assert.assertEquals("2014-10-14 12:34:56.789,XXX", result.get(4)); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeMissingField() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + + 
this.serde.initialize(null, props); + + final Text testText = new Text("giraffe,3.21,20,false"); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertEquals("giraffe", result.get(0)); + Assert.assertEquals("3.21", result.get(1)); + Assert.assertEquals("20", result.get(2)); + Assert.assertEquals("false", result.get(3)); + Assert.assertNull(result.get(4)); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeEmptyRow() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + + this.serde.initialize(null, props); + + final Text testText = new Text(""); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertNull(result.get(0)); + Assert.assertNull(result.get(1)); + Assert.assertNull(result.get(2)); + Assert.assertNull(result.get(3)); + Assert.assertNull(result.get(4)); + } + + @Test(expected = SerDeException.class) + public void testDeserializeEmptyRowRaiseException() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(AbstractTextSerDe.IGNORE_EMPTY_LINES, "false"); + + this.serde.initialize(null, props); + + final Text testText = new Text(""); + + this.serde.deserialize(testText); + } + + @Test(expected = SerDeException.class) + public void testDeserializeExtraFieldRowRaiseException() + throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT, "true"); + + this.serde.initialize(null, props); + + final Text testText = + new Text("giraffe,3.21,20,false,2014-10-14 12:34:56.789,XXX"); + + this.serde.deserialize(testText); + } + + @Test(expected = SerDeException.class) + public void testDeserializeMissingFieldRowRaiseException() + throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT, "true"); + + this.serde.initialize(null, props); + + final Text testText = new Text("giraffe,3.21,20,false"); + + this.serde.deserialize(testText); + } +} diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/text/TestDelimitedTextSerDe.java b/serde/src/test/org/apache/hadoop/hive/serde2/text/TestDelimitedTextSerDe.java new file mode 100644 index 0000000000..2575eea1fd --- /dev/null +++ b/serde/src/test/org/apache/hadoop/hive/serde2/text/TestDelimitedTextSerDe.java @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text; + +import java.sql.Timestamp; +import java.time.Instant; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; +import java.util.TimeZone; + +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * Test suite for {@link DelimitedTextSerDe}. + */ +public class TestDelimitedTextSerDe { + + private DelimitedTextSerDe serde; + + @Before + public void setup() { + this.serde = new DelimitedTextSerDe(); + } + + @Test + public void testSerialize() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(serdeConstants.FIELD_DELIM, "|"); + + this.serde.initialize(null, props); + + ObjectInspector oi = serde.getObjectInspector(); + + Object testStructure = Arrays.asList(new Text("giraffe"), + new DoubleWritable(3.21), new IntWritable(20), + new BooleanWritable(false), new TimestampWritable( + Timestamp.from(Instant.parse("2014-10-14T12:34:56.02Z")))); + + final TimeZone previousTimeZone = TimeZone.getDefault(); + TimeZone.setDefault(TimeZone.getTimeZone("UTC")); + try { + final String result = this.serde.doSerialize(testStructure, oi); + Assert.assertEquals("giraffe|3.21|20|false|2014-10-14 12:34:56.02", + result); + } finally { + TimeZone.setDefault(previousTimeZone); + } + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeDefault() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(serdeConstants.FIELD_DELIM, "|"); + + this.serde.initialize(null, props); + + final Text testText = + new Text("giraffe|3.21|20|false|2014-10-14 12:34:56.789"); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertEquals("giraffe", result.get(0)); + Assert.assertEquals("3.21", result.get(1)); + Assert.assertEquals("20", result.get(2)); + Assert.assertEquals("false", result.get(3)); + Assert.assertEquals("2014-10-14 12:34:56.789", result.get(4)); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeExtraField() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(serdeConstants.FIELD_DELIM, "|"); + + this.serde.initialize(null, props); + + final Text testText = + new Text("giraffe|3.21|20|false|2014-10-14 12:34:56.789|XXX"); + + final List result = (List) this.serde.deserialize(testText); + + 
Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertEquals("giraffe", result.get(0)); + Assert.assertEquals("3.21", result.get(1)); + Assert.assertEquals("20", result.get(2)); + Assert.assertEquals("false", result.get(3)); + Assert.assertEquals("2014-10-14 12:34:56.789|XXX", result.get(4)); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeMissingField() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(serdeConstants.FIELD_DELIM, "|"); + + this.serde.initialize(null, props); + + final Text testText = new Text("giraffe|3.21|20|false"); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertEquals("giraffe", result.get(0)); + Assert.assertEquals("3.21", result.get(1)); + Assert.assertEquals("20", result.get(2)); + Assert.assertEquals("false", result.get(3)); + Assert.assertNull(result.get(4)); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeEmptyRow() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(serdeConstants.FIELD_DELIM, "|"); + + this.serde.initialize(null, props); + + final Text testText = new Text(""); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertNull(result.get(0)); + Assert.assertNull(result.get(1)); + Assert.assertNull(result.get(2)); + Assert.assertNull(result.get(3)); + Assert.assertNull(result.get(4)); + } + + @Test(expected = SerDeException.class) + public void testDeserializeEmptyRowRaiseException() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(AbstractTextSerDe.IGNORE_EMPTY_LINES, "false"); + props.setProperty(serdeConstants.FIELD_DELIM, "|"); + + this.serde.initialize(null, props); + + final Text testText = new Text(""); + + this.serde.deserialize(testText); + } + + @Test(expected = SerDeException.class) + public void testDeserializeExtraFieldRowRaiseException() + throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT, "true"); + props.setProperty(serdeConstants.FIELD_DELIM, "|"); + + this.serde.initialize(null, props); + + final Text testText = + new Text("giraffe|3.21|20|false|2014-10-14 12:34:56.789|XXX"); + + this.serde.deserialize(testText); + } + + @Test(expected = SerDeException.class) + public void testDeserializeMissingFieldRowRaiseException() + throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT, "true"); + props.setProperty(serdeConstants.FIELD_DELIM, "|"); + + this.serde.initialize(null, props); + + final Text testText = new Text("giraffe|3.21|20|false"); + + this.serde.deserialize(testText); + } +} diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/text/TestRegexTextSerDe.java b/serde/src/test/org/apache/hadoop/hive/serde2/text/TestRegexTextSerDe.java new file mode 
100644 index 0000000000..929b892f51 --- /dev/null +++ b/serde/src/test/org/apache/hadoop/hive/serde2/text/TestRegexTextSerDe.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text; + +import java.util.List; +import java.util.Properties; + +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * Test suite for {@link RegexTextSerDe}. + */ +public class TestRegexTextSerDe { + + private RegexTextSerDe serde; + + @Before + public void setup() { + this.serde = new RegexTextSerDe(); + } + + /** + * RegexTextSerDe does not support serialization. + */ + @Test(expected = UnsupportedOperationException.class) + public void testSerialize() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "c1"); + props.setProperty(RegexTextSerDe.INPUT_REGEX, "(.*)"); + + this.serde.initialize(null, props); + + this.serde.serialize("test", + PrimitiveObjectInspectorFactory.javaStringObjectInspector); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeDefault() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "c1,c2"); + props.setProperty(RegexTextSerDe.INPUT_REGEX, "(.*),(.*)"); + + this.serde.initialize(null, props); + + final Text testText = new Text("A,B"); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(2, result.size()); + Assert.assertEquals("A", result.get(0)); + Assert.assertEquals("B", result.get(1)); + } + + @Test(expected = SerDeException.class) + public void testDeserializeExtraField() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "c1,c2"); + props.setProperty(RegexTextSerDe.INPUT_REGEX, "(A)(A)(A)"); + + this.serde.initialize(null, props); + + // 3 values, 2 columns + final Text testText = new Text("AAA"); + + this.serde.deserialize(testText); + } + + @Test(expected = SerDeException.class) + public void testDeserializeMissingField() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "c1,c2"); + props.setProperty(RegexTextSerDe.INPUT_REGEX, "(.*),(.*)"); + + this.serde.initialize(null, props); + + final Text testText = new Text("A"); + + this.serde.deserialize(testText); + } +} diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/text/TestTsvDelimitedTextSerDe.java 
b/serde/src/test/org/apache/hadoop/hive/serde2/text/TestTsvDelimitedTextSerDe.java new file mode 100644 index 0000000000..84145af317 --- /dev/null +++ b/serde/src/test/org/apache/hadoop/hive/serde2/text/TestTsvDelimitedTextSerDe.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text; + +import java.sql.Timestamp; +import java.time.Instant; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; +import java.util.TimeZone; + +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * Test suite for {@link TsvDelimitedTextSerDe}. 
+ */ +public class TestTsvDelimitedTextSerDe { + + private TsvDelimitedTextSerDe serde; + + @Before + public void setup() { + this.serde = new TsvDelimitedTextSerDe(); + } + + @Test + public void testSerialize() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + + this.serde.initialize(null, props); + + ObjectInspector oi = serde.getObjectInspector(); + + Object testStructure = Arrays.asList(new Text("giraffe"), + new DoubleWritable(3.21), new IntWritable(20), + new BooleanWritable(false), new TimestampWritable( + Timestamp.from(Instant.parse("2014-10-14T12:34:56.02Z")))); + + final TimeZone previousTimeZone = TimeZone.getDefault(); + TimeZone.setDefault(TimeZone.getTimeZone("UTC")); + try { + final String result = this.serde.doSerialize(testStructure, oi); + Assert.assertEquals("giraffe\t3.21\t20\tfalse\t2014-10-14 12:34:56.02", + result); + } finally { + TimeZone.setDefault(previousTimeZone); + } + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeDefault() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + + this.serde.initialize(null, props); + + final Text testText = + new Text("giraffe\t3.21\t20\tfalse\t2014-10-14 12:34:56.789"); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertEquals("giraffe", result.get(0)); + Assert.assertEquals("3.21", result.get(1)); + Assert.assertEquals("20", result.get(2)); + Assert.assertEquals("false", result.get(3)); + Assert.assertEquals("2014-10-14 12:34:56.789", result.get(4)); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeExtraField() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + + this.serde.initialize(null, props); + + final Text testText = + new Text("giraffe\t3.21\t20\tfalse\t2014-10-14 12:34:56.789\tXXX"); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertEquals("giraffe", result.get(0)); + Assert.assertEquals("3.21", result.get(1)); + Assert.assertEquals("20", result.get(2)); + Assert.assertEquals("false", result.get(3)); + Assert.assertEquals("2014-10-14 12:34:56.789\tXXX", result.get(4)); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeMissingField() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + + this.serde.initialize(null, props); + + final Text testText = new Text("giraffe\t3.21\t20\tfalse"); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertEquals("giraffe", result.get(0)); + Assert.assertEquals("3.21", result.get(1)); + Assert.assertEquals("20", result.get(2)); + Assert.assertEquals("false", result.get(3)); + Assert.assertNull(result.get(4)); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeEmptyRow() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + + this.serde.initialize(null, props); + + final 
Text testText = new Text(""); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(5, result.size()); + Assert.assertNull(result.get(0)); + Assert.assertNull(result.get(1)); + Assert.assertNull(result.get(2)); + Assert.assertNull(result.get(3)); + Assert.assertNull(result.get(4)); + } + + @Test(expected = SerDeException.class) + public void testDeserializeEmptyRowRaiseException() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(AbstractTextSerDe.IGNORE_EMPTY_LINES, "false"); + + this.serde.initialize(null, props); + + final Text testText = new Text(""); + + this.serde.deserialize(testText); + } + + @Test(expected = SerDeException.class) + public void testDeserializeExtraFieldRowRaiseException() + throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT, "true"); + + this.serde.initialize(null, props); + + final Text testText = + new Text("giraffe\t3.21\t20\tfalse\t2014-10-14 12:34:56.789\tXXX"); + + this.serde.deserialize(testText); + } + + @Test(expected = SerDeException.class) + public void testDeserializeMissingFieldRowRaiseException() + throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "name,height,weight,endangered,born"); + props.setProperty(AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT, "true"); + + this.serde.initialize(null, props); + + final Text testText = new Text("giraffe\t3.21\t20\tfalse"); + + this.serde.deserialize(testText); + } +} diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/text/log/TestCombinedLogFormatSerDe.java b/serde/src/test/org/apache/hadoop/hive/serde2/text/log/TestCombinedLogFormatSerDe.java new file mode 100644 index 0000000000..bde10da63c --- /dev/null +++ b/serde/src/test/org/apache/hadoop/hive/serde2/text/log/TestCombinedLogFormatSerDe.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.text.log; + +import java.util.List; +import java.util.Properties; + +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.junit.Before; +import org.junit.Test; + +import org.junit.Assert; + +/** + * Test suite for {@link CombinedLogFormatSerDe}. 
+ */ +public class TestCombinedLogFormatSerDe { + + private CombinedLogFormatSerDe serde; + + @Before + public void setup() { + this.serde = new CombinedLogFormatSerDe(); + } + + /** + * CombinedLogFormatSerDe does not support serialization. + */ + @Test(expected = UnsupportedOperationException.class) + public void testSerialize() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "c1,c2,c3,c4,c5,c6,c7,c8,c9"); + + this.serde.initialize(null, props); + + this.serde.serialize("test", + PrimitiveObjectInspectorFactory.javaStringObjectInspector); + } + + /** + * CombinedLogFormatSerDe requires a predefined schema that matches the + * log format. + */ + @Test(expected = SerDeException.class) + public void testColumnCount() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "c1,c2,c3,c4,c5,c6,c7,c8"); + + this.serde.initialize(null, props); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeDefault() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, + "c1,c2,c3,c4,c5,c6,c7,c8,c9"); + + this.serde.initialize(null, props); + + final Text testText = + new Text("127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] " + + "\"GET /pb.gif HTTP/1.0\" 200 2326 " + + "\"http://www.example.com/start.html\" " + + "\"Mozilla/4.08 [en] (Win98; I ;Nav)\""); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(9, result.size()); + Assert.assertEquals("127.0.0.1", result.get(0)); + Assert.assertNull(result.get(1)); + Assert.assertEquals("frank", result.get(2)); + Assert.assertEquals("10/Oct/2000:13:55:36 -0700", result.get(3)); + Assert.assertEquals("GET /pb.gif HTTP/1.0", result.get(4)); + Assert.assertEquals("200", result.get(5)); + Assert.assertEquals("2326", result.get(6)); + Assert.assertEquals("http://www.example.com/start.html", result.get(7)); + Assert.assertEquals("Mozilla/4.08 [en] (Win98; I ;Nav)", result.get(8)); + } +} diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/text/log/TestCommonLogFormatSerDe.java b/serde/src/test/org/apache/hadoop/hive/serde2/text/log/TestCommonLogFormatSerDe.java new file mode 100644 index 0000000000..3000842403 --- /dev/null +++ b/serde/src/test/org/apache/hadoop/hive/serde2/text/log/TestCommonLogFormatSerDe.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.serde2.text.log; + +import java.util.List; +import java.util.Properties; + +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.junit.Before; +import org.junit.Test; + +import org.junit.Assert; + +/** + * Test suite for {@link CommonLogFormatSerDe}. + */ +public class TestCommonLogFormatSerDe { + + private CommonLogFormatSerDe serde; + + @Before + public void setup() { + this.serde = new CommonLogFormatSerDe(); + } + + /** + * CommonLogFormatSerDe does not support serialization. + */ + @Test(expected = UnsupportedOperationException.class) + public void testSerialize() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "c1,c2,c3,c4,c5,c6,c7"); + + this.serde.initialize(null, props); + + this.serde.serialize("test", + PrimitiveObjectInspectorFactory.javaStringObjectInspector); + } + + /** + * CommonLogFormatSerDe requires a predefined schema that matches the + * log format. + */ + @Test(expected = SerDeException.class) + public void testColumnCount() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "c1,c2,c3,c4,c5,c6"); + + this.serde.initialize(null, props); + } + + @Test + @SuppressWarnings("unchecked") + public void testDeserializeDefault() throws SerDeException { + Properties props = new Properties(); + props.setProperty(serdeConstants.LIST_COLUMNS, "c1,c2,c3,c4,c5,c6,c7"); + + this.serde.initialize(null, props); + + final Text testText = new Text("127.0.0.1 - frank " + + "[10/Oct/2000:13:55:36 -0700] \"GET /pb.gif HTTP/1.0\" 200 2326"); + + final List result = (List) this.serde.deserialize(testText); + + Assert.assertNotNull(result); + Assert.assertEquals(7, result.size()); + Assert.assertEquals("127.0.0.1", result.get(0)); + Assert.assertNull(result.get(1)); + Assert.assertEquals("frank", result.get(2)); + Assert.assertEquals("10/Oct/2000:13:55:36 -0700", result.get(3)); + Assert.assertEquals("GET /pb.gif HTTP/1.0", result.get(4)); + Assert.assertEquals("200", result.get(5)); + Assert.assertEquals("2326", result.get(6)); + } +} diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java index a496a593c2..64f7c1498a 100644 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java +++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java @@ -938,6 +938,9 @@ public static ConfVars getMetaConf(String name) { "hive.serdes.using.metastore.for.schema", "org.apache.hadoop.hive.ql.io.orc.OrcSerde," + "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe," + + "org.apache.hadoop.hive.serde2.text.CsvDelimitedTextSerDe," + + "org.apache.hadoop.hive.serde2.text.TsvDelimitedTextSerDe," + + "org.apache.hadoop.hive.serde2.text.DelimitedTextSerDe," + "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe," + "org.apache.hadoop.hive.serde2.dynamic_type.DynamicSerDe," + "org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe," +
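
Reviewer note (illustrative, not part of the patch): the tests above all drive the new SerDes through the same three steps, namely building a Properties object, calling initialize(null, props), and casting the result of deserialize() to a list of string fields. The sketch below distills that flow for DelimitedTextSerDe so it can be tried outside JUnit. It assumes only what the tests demonstrate: a no-argument constructor, configuration via serdeConstants.LIST_COLUMNS and serdeConstants.FIELD_DELIM, the optional AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT flag, and tolerance of a null Configuration. The example class itself and its main method are hypothetical.

import java.util.List;
import java.util.Properties;

import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.text.AbstractDelimitedTextSerDe;
import org.apache.hadoop.hive.serde2.text.DelimitedTextSerDe;
import org.apache.hadoop.io.Text;

/** Minimal driver sketch for DelimitedTextSerDe, mirroring the unit tests above. */
public class DelimitedTextSerDeExample {

  public static void main(String[] args) throws SerDeException {
    DelimitedTextSerDe serde = new DelimitedTextSerDe();

    Properties props = new Properties();
    // The column list fixes how many fields each row is expected to carry.
    props.setProperty(serdeConstants.LIST_COLUMNS,
        "name,height,weight,endangered,born");
    // Field delimiter; the TSV variant hard-codes its own delimiter instead.
    props.setProperty(serdeConstants.FIELD_DELIM, "|");
    // Optional: raise a SerDeException on rows whose field count does not
    // match the schema, rather than padding or folding the remainder.
    props.setProperty(AbstractDelimitedTextSerDe.STRICT_FIELDS_COUNT, "true");

    // The unit tests pass a null Configuration, so none is needed here.
    serde.initialize(null, props);

    // One pipe-delimited row in, a list of raw string fields out.
    List<?> row = (List<?>) serde.deserialize(
        new Text("giraffe|3.21|20|false|2014-10-14 12:34:56.789"));

    for (Object field : row) {
      System.out.println(field);
    }
  }
}

With STRICT_FIELDS_COUNT left at its default of false, the behaviour asserted by testDeserializeMissingField and testDeserializeExtraField applies instead: short rows are padded with nulls, and any text beyond the last declared column stays attached to the final field.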