diff --git pom.xml pom.xml
index 8973c2b..e70bb2a 100644
--- pom.xml
+++ pom.xml
@@ -137,6 +137,7 @@
     <libfb303.version>0.9.0</libfb303.version>
     <libthrift.version>0.9.0</libthrift.version>
     <log4j.version>1.2.16</log4j.version>
+    <opencsv.version>2.3</opencsv.version>
     <mockito-all.version>1.9.5</mockito-all.version>
     <mina.version>2.0.0-M5</mina.version>
+    <dependency>
+      <groupId>net.sf.opencsv</groupId>
+      <artifactId>opencsv</artifactId>
+      <version>${opencsv.version}</version>
+    </dependency>
+
     <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
diff --git serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java
new file mode 100644
index 0000000..2fc90ce
--- /dev/null
+++ serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java
@@ -0,0 +1,206 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+import java.io.CharArrayReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+
+import au.com.bytecode.opencsv.CSVReader;
+import au.com.bytecode.opencsv.CSVWriter;
+
+/**
+ * OpenCSVSerde uses opencsv to deserialize data in CSV format.
+ * Users can specify custom separator, quote or escape characters. The default separator (,),
+ * quote (") and escape character (\) match the opencsv library defaults.
+ *
+ */
+public final class OpenCSVSerde extends AbstractSerDe {
+
+  public static final Log LOG = LogFactory.getLog(OpenCSVSerde.class.getName());
+  private ObjectInspector inspector;
+  private String[] outputFields;
+  private int numCols;
+  private List<String> row;
+
+  private char separatorChar;
+  private char quoteChar;
+  private char escapeChar;
+
+  public final String SEPARATORCHAR = "separatorChar";
+  public final String QUOTECHAR = "quoteChar";
+  public final String ESCAPECHAR = "escapeChar";
+
+  @Override
+  public void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
+
+    final List<String> columnNames = Arrays.asList(tbl.getProperty(serdeConstants.LIST_COLUMNS)
+        .split(","));
+
+    numCols = columnNames.size();
+
+    final List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(numCols);
+
+    for (int i = 0; i < numCols; i++) {
+      columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+    }
+
+    inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames,
+        columnOIs);
+    outputFields = new String[numCols];
+    row = new ArrayList<String>(numCols);
+
+    for (int i = 0; i < numCols; i++) {
+      row.add(null);
+    }
+
+    separatorChar = getProperty(tbl, SEPARATORCHAR, CSVWriter.DEFAULT_SEPARATOR);
+    quoteChar = getProperty(tbl, QUOTECHAR, CSVWriter.DEFAULT_QUOTE_CHARACTER);
+    escapeChar = getProperty(tbl, ESCAPECHAR, CSVWriter.DEFAULT_ESCAPE_CHARACTER);
+  }
+
+  private char getProperty(final Properties tbl, final String property, final char def) {
+    final String val = tbl.getProperty(property);
+
+    if (val != null) {
+      return val.charAt(0);
+    }
+
+    return def;
+  }
+
+  @Override
+  public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
+    final StructObjectInspector outputRowOI = (StructObjectInspector) objInspector;
+    final List<? extends StructField> outputFieldRefs = outputRowOI.getAllStructFieldRefs();
+
+    if (outputFieldRefs.size() != numCols) {
+      throw new SerDeException("Cannot serialize the object because there are "
+          + outputFieldRefs.size() + " fields but the table has " + numCols + " columns.");
+    }
+
+    // Get all data out.
+    for (int c = 0; c < numCols; c++) {
+      final Object field = outputRowOI.getStructFieldData(obj, outputFieldRefs.get(c));
+      final ObjectInspector fieldOI = outputFieldRefs.get(c).getFieldObjectInspector();
+
+      // The data must be of type String.
+      final StringObjectInspector fieldStringOI = (StringObjectInspector) fieldOI;
+
+      // Convert the field to a Java String, because string-typed values may be
+      // stored in String, Text, or some other classes.
+      outputFields[c] = fieldStringOI.getPrimitiveJavaObject(field);
+    }
+
+    final StringWriter writer = new StringWriter();
+    final CSVWriter csv = newWriter(writer, separatorChar, quoteChar, escapeChar);
+
+    try {
+      csv.writeNext(outputFields);
+      csv.close();
+
+      return new Text(writer.toString());
+    } catch (final IOException ioe) {
+      throw new SerDeException(ioe);
+    }
+  }
+
+  @Override
+  public Object deserialize(final Writable blob) throws SerDeException {
+    Text rowText = (Text) blob;
+
+    CSVReader csv = null;
+    try {
+      csv = newReader(new CharArrayReader(rowText.toString().toCharArray()), separatorChar,
+          quoteChar, escapeChar);
+      final String[] read = csv.readNext();
+
+      for (int i = 0; i < numCols; i++) {
+        if (read != null && i < read.length) {
+          row.set(i, read[i]);
+        } else {
+          row.set(i, null);
+        }
+      }
+
+      return row;
+    } catch (final Exception e) {
+      throw new SerDeException(e);
+    } finally {
+      if (csv != null) {
+        try {
+          csv.close();
+        } catch (final Exception e) {
+          LOG.error("Failed to close the CSV reader", e);
+        }
+      }
+    }
+  }
+
+  private CSVReader newReader(final Reader reader, char separator, char quote, char escape) {
+    // CSVReader will throw an exception if the separator, quote, or escape characters are equal
+    // to one another, yet standard CSV uses the quote character as the escape character. In that
+    // case, fall back to the constructor that takes no explicit escape character.
+    if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
+      return new CSVReader(reader, separator, quote);
+    } else {
+      return new CSVReader(reader, separator, quote, escape);
+    }
+  }
+
+  private CSVWriter newWriter(final Writer writer, char separator, char quote, char escape) {
+    if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
+      return new CSVWriter(writer, separator, quote, "");
+    } else {
+      return new CSVWriter(writer, separator, quote, escape, "");
+    }
+  }
+
+  @Override
+  public ObjectInspector getObjectInspector() throws SerDeException {
+    return inspector;
+  }
+
+  @Override
+  public Class<? extends Writable> getSerializedClass() {
+    return Text.class;
+  }
+
+  @Override
+  public SerDeStats getSerDeStats() {
+    return null;
+  }
+}
diff --git serde/src/test/org/apache/hadoop/hive/serde2/TestOpenCSVSerde.java serde/src/test/org/apache/hadoop/hive/serde2/TestOpenCSVSerde.java
new file mode 100644
index 0000000..e97bddd
--- /dev/null
+++ serde/src/test/org/apache/hadoop/hive/serde2/TestOpenCSVSerde.java
@@ -0,0 +1,82 @@
+package org.apache.hadoop.hive.serde2;
+
+import java.util.List;
+import java.util.Properties;
+
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.io.Text;
+import org.junit.Before;
+import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestOpenCSVSerde {
+  private final OpenCSVSerde csv = new OpenCSVSerde();
+  private final Properties props = new Properties();
+
+  @Before
+  public void setup() throws Exception {
+    props.setProperty(serdeConstants.LIST_COLUMNS, "a,b,c");
+    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string,string,string");
+  }
+
+  @Test
+  public void testDeserialize() throws Exception {
+    csv.initialize(null, props);
+    final Text in = new Text("hello,\"yes, okay\",1");
+
+    final List<String> row = (List<String>) csv.deserialize(in);
+
+    assertEquals("hello", row.get(0));
+    assertEquals("yes, okay", row.get(1));
+    assertEquals("1", row.get(2));
+  }
+
+  @Test
+  public void testDeserializeCustomSeparators() throws Exception {
+    props.put("separatorChar", "\t");
+    props.put("quoteChar", "'");
+
+    csv.initialize(null, props);
+
+    final Text in = new Text("hello\t'yes\tokay'\t1");
+    final List<String> row = (List<String>) csv.deserialize(in);
+
+    assertEquals("hello", row.get(0));
+    assertEquals("yes\tokay", row.get(1));
+    assertEquals("1", row.get(2));
+  }
+
+  @Test
+  public void testDeserializeCustomEscape() throws Exception {
+    props.put("quoteChar", "'");
+    props.put("escapeChar", "\\");
+
+    csv.initialize(null, props);
+
+    final Text in = new Text("hello,'yes\\'okay',1");
+    final List<String> row = (List<String>) csv.deserialize(in);
+
+    assertEquals("hello", row.get(0));
+    assertEquals("yes'okay", row.get(1));
+    assertEquals("1", row.get(2));
+  }
+}
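
Note: the tests above only exercise the deserialize() path. Below is a minimal, hypothetical sketch (not part of the patch) of how the serialize() path could be driven from plain Java, mirroring the three-string-column setup used in TestOpenCSVSerde. The class name OpenCSVSerdeSerializeSketch and the expected output in the final comment are assumptions based on opencsv's default quoting behaviour, not behaviour asserted anywhere in the patch.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.OpenCSVSerde;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;

public class OpenCSVSerdeSerializeSketch {
  public static void main(String[] args) throws Exception {
    // Configure the SerDe the same way the tests do: three string columns with the
    // default separator, quote, and escape characters.
    OpenCSVSerde serde = new OpenCSVSerde();
    Properties props = new Properties();
    props.setProperty(serdeConstants.LIST_COLUMNS, "a,b,c");
    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string,string,string");
    serde.initialize(null, props);

    // serialize() expects a StructObjectInspector whose fields are all strings.
    List<String> names = Arrays.asList("a", "b", "c");
    List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
    for (int i = 0; i < names.size(); i++) {
      fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    }
    StructObjectInspector rowOI =
        ObjectInspectorFactory.getStandardStructObjectInspector(names, fieldOIs);

    // A standard struct can be backed by a List, so a plain List<String> serves as the row.
    Text line = (Text) serde.serialize(Arrays.asList("hello", "yes, okay", "1"), rowOI);

    // With the defaults, opencsv's CSVWriter quotes every field, so the expected
    // (unasserted) output is: "hello","yes, okay","1"
    System.out.println(line);
  }
}

A serialize-side check along these lines would also cover the newWriter()/quoting behaviour, which the current test class leaves untested.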