diff --git serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java index 8d3595b..b15f76e 100644 --- serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java +++ serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java @@ -66,7 +66,7 @@ public static final String HEADER_COUNT = "skip.header.line.count"; public static final String FOOTER_COUNT = "skip.footer.line.count"; - + public static final String VOID_TYPE_NAME = "void"; public static final String BOOLEAN_TYPE_NAME = "boolean"; diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java index e3968a9..3969480 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java @@ -220,9 +220,11 @@ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, byte[] separator, int separatorIndex, Text nullSequence, boolean escaped, byte escapeChar, ObjectInspectorOptions option) throws SerDeException { return createLazyObjectInspector(typeInfo, separator, separatorIndex, nullSequence, - escaped, escapeChar, false, option); + escaped, escapeChar, false,(byte)0, false, false,false, false, option); } + + /** * Create a hierarchical ObjectInspector for LazyObject with the given * typeInfo. 
@@ -244,9 +246,26 @@ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, byte[] separator, int separatorIndex, Text nullSequence, boolean escaped, byte escapeChar) throws SerDeException { return createLazyObjectInspector(typeInfo, separator, separatorIndex, nullSequence, - escaped, escapeChar, false, ObjectInspectorOptions.JAVA); + escaped, escapeChar, false,(byte)0, false, false,false, false, ObjectInspectorOptions.JAVA); } - + + + public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, + byte[] separator, int separatorIndex, Text nullSequence, boolean escaped, + byte escapeChar, boolean extendedLiteral) throws SerDeException { + return createLazyObjectInspector(typeInfo, separator, separatorIndex, nullSequence, + escaped, escapeChar, extendedLiteral, ObjectInspectorOptions.JAVA); + } + + + public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, + byte[] separator, int separatorIndex, Text nullSequence, boolean escaped, + byte escapeChar, boolean extendedBool, ObjectInspectorOptions option) throws SerDeException { + return createLazyObjectInspector(typeInfo, separator, separatorIndex, nullSequence, + escaped, escapeChar, false,(byte)0, false, false,false, extendedBool, option); + } + + /** * Create a hierarchical ObjectInspector for LazyObject with the given typeInfo. 
* @@ -263,9 +282,10 @@ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, */ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, byte[] separator, int separatorIndex, Text nullSequence, boolean escaped, - byte escapeChar, boolean extendedBooleanLiteral) throws SerDeException { + byte escapeChar,boolean quoted, byte quotechar, boolean dblQuoteAsQuote, + boolean extendedBooleanLiteral) throws SerDeException { return createLazyObjectInspector(typeInfo, separator, separatorIndex, nullSequence, escaped, - escapeChar, extendedBooleanLiteral, ObjectInspectorOptions.JAVA); + escapeChar, quoted, quotechar, false, false, dblQuoteAsQuote, extendedBooleanLiteral, ObjectInspectorOptions.JAVA); } /** @@ -284,12 +304,15 @@ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, */ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, byte[] separator, int separatorIndex, Text nullSequence, boolean escaped, - byte escapeChar, boolean extendedBooleanLiteral, ObjectInspectorOptions option) throws SerDeException { + byte escapeChar, boolean quoted, byte quotechar, boolean ltrim, boolean rtrim, + boolean dblQuoteAsQuote, boolean extendedBooleanLiteral, + ObjectInspectorOptions option) throws SerDeException { ObjectInspector.Category c = typeInfo.getCategory(); switch (c) { case PRIMITIVE: return LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector( - (PrimitiveTypeInfo) typeInfo, escaped, escapeChar, extendedBooleanLiteral); + (PrimitiveTypeInfo) typeInfo, escaped, escapeChar, quoted, quotechar, + dblQuoteAsQuote, extendedBooleanLiteral); case MAP: return LazyObjectInspectorFactory.getLazySimpleMapObjectInspector( createLazyObjectInspector(((MapTypeInfo) typeInfo) @@ -321,8 +344,9 @@ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, return LazyObjectInspectorFactory.getLazySimpleStructObjectInspector( fieldNames, fieldObjectInspectors, LazyUtils.getSeparator(separator, 
separatorIndex), - nullSequence, - false, escaped, escapeChar, option); + nullSequence, + false, escaped, escapeChar, quoted, quotechar, ltrim, rtrim, + dblQuoteAsQuote, option); case UNION: UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo; List lazyOIs = new ArrayList(); @@ -353,9 +377,32 @@ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, public static ObjectInspector createLazyStructInspector( List columnNames, List typeInfos, byte[] separators, Text nullSequence, boolean lastColumnTakesRest, boolean escaped, + byte escapeChar, boolean quoted, byte quotechar, boolean ltrim, boolean rtrim, + boolean dblQuoteAsQuote) throws SerDeException { + return createLazyStructInspector(columnNames, typeInfos, separators, + nullSequence, lastColumnTakesRest, escaped, escapeChar, + quoted, quotechar, ltrim, rtrim, + dblQuoteAsQuote, false); + } + + public static ObjectInspector createLazyStructInspector( + List columnNames, List typeInfos, byte[] separators, + Text nullSequence, boolean lastColumnTakesRest, boolean escaped, + byte escapeChar, boolean extendedBooleanLiteral) throws SerDeException { + return createLazyStructInspector(columnNames, typeInfos, separators, + nullSequence, lastColumnTakesRest, escaped, escapeChar, + false, (byte)0, false, false, + false, extendedBooleanLiteral); + } + + public static ObjectInspector createLazyStructInspector( + List columnNames, List typeInfos, byte[] separators, + Text nullSequence, boolean lastColumnTakesRest, boolean escaped, byte escapeChar) throws SerDeException { return createLazyStructInspector(columnNames, typeInfos, separators, - nullSequence, lastColumnTakesRest, escaped, escapeChar, false); + nullSequence, lastColumnTakesRest, escaped, escapeChar, + false, (byte)0, false, false, + false, false); } /** @@ -373,17 +420,19 @@ public static ObjectInspector createLazyStructInspector( public static ObjectInspector createLazyStructInspector( List columnNames, List typeInfos, byte[] separators, Text 
nullSequence, boolean lastColumnTakesRest, boolean escaped, - byte escapeChar, boolean extendedBooleanLiteral) throws SerDeException { + byte escapeChar, boolean quoted, byte quotechar, boolean ltrim, boolean rtrim, + boolean dblQuoteAsQuote, boolean extendedBooleanLiteral) throws SerDeException { ArrayList columnObjectInspectors = new ArrayList( typeInfos.size()); for (int i = 0; i < typeInfos.size(); i++) { columnObjectInspectors.add(LazyFactory.createLazyObjectInspector( - typeInfos.get(i), separators, 1, nullSequence, escaped, escapeChar, - extendedBooleanLiteral)); + typeInfos.get(i), separators, 1, nullSequence, escaped, escapeChar, + quoted, quotechar, dblQuoteAsQuote, extendedBooleanLiteral)); } return LazyObjectInspectorFactory.getLazySimpleStructObjectInspector( columnNames, columnObjectInspectors, separators[0], nullSequence, - lastColumnTakesRest, escaped, escapeChar); + lastColumnTakesRest, escaped, escapeChar, quoted, quotechar, ltrim, rtrim, + dblQuoteAsQuote); } /** diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveChar.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveChar.java index 3799c7c..6b2c9b1 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveChar.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveChar.java @@ -55,12 +55,17 @@ public void setValue(LazyHiveChar copy) { @Override public void init(ByteArrayRef bytes, int start, int length) { - if (oi.isEscaped()) { - Text textData = data.getTextValue(); - // This is doing a lot of copying here, this could be improved by enforcing length - // at the same time as escaping rather than as separate steps. 
- LazyUtils.copyAndEscapeStringDataToText(bytes.getData(), start, length, - oi.getEscapeChar(),textData); + if (oi.isEscaped() || oi.isQuoted()) { + byte escapeChar = oi.getEscapeChar(); + byte quoteChar = oi.getQuoteChar(); + boolean escaped = oi.isEscaped(); + boolean quoted = oi.isQuoted(); + boolean treatDblQuoteAsQuote = oi.treatDblQuotesAsQuote(); + + Text textData = data.getTextValue(); + + LazyUtils.copyAndEscapeStringDataToText(bytes.getData(),start, length, escapeChar, quoteChar, + escaped, quoted, treatDblQuoteAsQuote, textData); data.set(textData.toString(), maxLength); isNull = false; } else { diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveVarchar.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveVarchar.java index b4659e7..891e674 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveVarchar.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveVarchar.java @@ -55,12 +55,17 @@ public void setValue(LazyHiveVarchar copy) { @Override public void init(ByteArrayRef bytes, int start, int length) { - if (oi.isEscaped()) { - Text textData = data.getTextValue(); - // This is doing a lot of copying here, this could be improved by enforcing length - // at the same time as escaping rather than as separate steps. 
- LazyUtils.copyAndEscapeStringDataToText(bytes.getData(), start, length, - oi.getEscapeChar(),textData); + if (oi.isEscaped() || oi.isQuoted()) { + byte escapeChar = oi.getEscapeChar(); + byte quoteChar = oi.getQuoteChar(); + boolean escaped = oi.isEscaped(); + boolean quoted = oi.isQuoted(); + boolean treatDblQuoteAsQuote = oi.treatDblQuotesAsQuote(); + + Text textData = data.getTextValue(); + + LazyUtils.copyAndEscapeStringDataToText(bytes.getData(),start, length, escapeChar, quoteChar, + escaped, quoted, treatDblQuoteAsQuote, textData); data.set(textData.toString(), maxLength); isNull = false; } else { diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java index 95e30db..23454bf 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java @@ -70,7 +70,13 @@ serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST, serdeConstants.ESCAPE_CHAR, serdeConstants.SERIALIZATION_ENCODING, - LazySimpleSerDe.SERIALIZATION_EXTEND_NESTING_LEVELS}) + LazySimpleSerDe.SERIALIZATION_EXTEND_NESTING_LEVELS, + serdeConstants.QUOTE_CHAR, + LazySimpleSerDe.FIELD_ENCLOSED, + LazySimpleSerDe.FIELD_LTRIM, + LazySimpleSerDe.FIELD_RTRIM, + LazySimpleSerDe.FIELD_DOUBLEQUOTESASQUOTE + }) public class LazySimpleSerDe extends AbstractEncodingAwareSerDe { public static final Log LOG = LogFactory.getLog(LazySimpleSerDe.class @@ -78,6 +84,16 @@ public static final String SERIALIZATION_EXTEND_NESTING_LEVELS = "hive.serialization.extend.nesting.levels"; + + + public static final String FIELD_DOUBLEQUOTESASQUOTE="field.doublequotes.as.quote"; + + public static final String FIELD_ENCLOSED="field.enclosure"; + + public static final String FIELD_RTRIM="field.rtrim"; + + public static final String FIELD_LTRIM="field.ltrim"; + public static final byte[] DefaultSeparators = {(byte) 1, (byte) 2, (byte) 3}; @@ -133,6 
+149,9 @@ public static byte getByte(String altValue, byte defaultVal) { Text nullSequence; TypeInfo rowTypeInfo; boolean lastColumnTakesRest; + boolean ltrimFields; + boolean rtrimFields; + List columnNames; List columnTypes; @@ -141,6 +160,10 @@ public static byte getByte(String altValue, byte defaultVal) { boolean[] needsEscape; boolean extendedBooleanLiteral; + boolean dblQuotesAsQuote; + + boolean quoted; + byte quoteChar; public List getColumnTypes() { return columnTypes; @@ -181,6 +204,25 @@ public byte getEscapeChar() { public boolean[] getNeedsEscape() { return needsEscape; } + + public boolean isQuoted() { + return quoted; + } + + public byte getQuoteChar() { + return quoteChar; + } + + public boolean ltrim() { + return ltrimFields; + } + public boolean rtrim() { + return rtrimFields; + } + + public boolean treatDblQuotesAsQuote() { + return dblQuotesAsQuote; + } } SerDeParameters serdeParams = null; @@ -207,7 +249,10 @@ public void initialize(Configuration job, Properties tbl) .getColumnNames(), serdeParams.getColumnTypes(), serdeParams .getSeparators(), serdeParams.getNullSequence(), serdeParams .isLastColumnTakesRest(), serdeParams.isEscaped(), serdeParams - .getEscapeChar(), serdeParams.extendedBooleanLiteral); + .getEscapeChar(), serdeParams.isQuoted(), + serdeParams.getQuoteChar(), serdeParams.ltrim(), serdeParams.rtrim(), + serdeParams.treatDblQuotesAsQuote(), serdeParams.extendedBooleanLiteral); + cachedLazyStruct = (LazyStruct) LazyFactory .createLazyObject(cachedObjectInspector); @@ -215,6 +260,7 @@ public void initialize(Configuration job, Properties tbl) LOG.debug(getClass().getName() + " initialized with: columnNames=" + serdeParams.columnNames + " columnTypes=" + serdeParams.columnTypes + " separator=" + Arrays.asList(serdeParams.separators) + + " ltrim=" + serdeParams.ltrimFields + " rtrim="+serdeParams.rtrimFields + " nullstring=" + serdeParams.nullString + " lastColumnTakesRest=" + serdeParams.lastColumnTakesRest); @@ -299,12 +345,34 @@ 
public static SerDeParameters initSerdeParams(Configuration job, serdeParams.lastColumnTakesRest = (lastColumnTakesRestString != null && lastColumnTakesRestString .equalsIgnoreCase("true")); + String ltrimString = tbl + .getProperty(LazySimpleSerDe.FIELD_LTRIM); + serdeParams.ltrimFields = (ltrimString != null && ltrimString + .equalsIgnoreCase("true")); + + String rtrimString = tbl + .getProperty(LazySimpleSerDe.FIELD_RTRIM); + serdeParams.rtrimFields = (rtrimString != null && rtrimString + .equalsIgnoreCase("true")); + + LazyUtils.extractColumnInfo(tbl, serdeParams, serdeName); // Create the LazyObject for storing the rows serdeParams.rowTypeInfo = TypeInfoFactory.getStructTypeInfo( serdeParams.columnNames, serdeParams.columnTypes); + String quoteProperty = tbl.getProperty(serdeConstants.QUOTE_CHAR); + serdeParams.quoted = (quoteProperty != null); + if (serdeParams.quoted) { + serdeParams.quoteChar = getByte(quoteProperty, (byte) '\"'); + } + + String fDblQuotesAsQuotes = tbl + .getProperty(LazySimpleSerDe.FIELD_DOUBLEQUOTESASQUOTE); + serdeParams.dblQuotesAsQuote = (fDblQuotesAsQuotes != null && fDblQuotesAsQuotes + .equalsIgnoreCase("true")); + // Get the escape information String escapeProperty = tbl.getProperty(serdeConstants.ESCAPE_CHAR); serdeParams.escaped = (escapeProperty != null); @@ -320,6 +388,13 @@ public static SerDeParameters initSerdeParams(Configuration job, for (int i = 0; i < serdeParams.separators.length; i++) { serdeParams.needsEscape[serdeParams.separators[i]] = true; } + + // also add needs escape to quotes + if (serdeParams.quoted) + { + if (serdeParams.quoteChar < 128) + serdeParams.needsEscape[serdeParams.quoteChar] = true; + } } serdeParams.extendedBooleanLiteral = job == null ? 
false : diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java index 75b9556..45f7f2e 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java @@ -37,10 +37,18 @@ public LazyString(LazyString copy) { @Override public void init(ByteArrayRef bytes, int start, int length) { - if (oi.isEscaped()) { + + if (oi.isEscaped() || oi.isQuoted()) { byte escapeChar = oi.getEscapeChar(); + byte quoteChar = oi.getQuoteChar(); + boolean escaped = oi.isEscaped(); + boolean quoted = oi.isQuoted(); + boolean treatDblQuoteAsQuote = oi.treatDblQuotesAsQuote(); + byte[] inputBytes = bytes.getData(); - LazyUtils.copyAndEscapeStringDataToText(inputBytes, start, length, escapeChar, data); + + LazyUtils.copyAndEscapeStringDataToText(inputBytes, start, length, escapeChar, quoteChar, + escaped, quoted, treatDblQuoteAsQuote, data); } else { // if the data is not escaped, simply copy the data. data.set(bytes.getData(), start, length); diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java index 588cc8c..3a826fe 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java @@ -43,6 +43,9 @@ implements StructObject, SerDeStatsStruct { private static Log LOG = LogFactory.getLog(LazyStruct.class.getName()); + + private static final byte SPACE_CHAR = (byte)' '; + private static final byte TAB_CHAR = (byte)'\t'; /** * Whether the data is already parsed or not. @@ -59,8 +62,12 @@ * Note that startPosition[arrayLength] = begin + length + 1; that makes sure * we can use the same formula to compute the length of each element of the * array. 
+ * + * note updated to use separate arrays for start and end position to allow for trim */ int[] startPosition; + int[] dataStartPosition; + int[] dataEndPosition; /** * The fields of the struct. @@ -102,6 +109,12 @@ private void parse() { boolean lastColumnTakesRest = oi.getLastColumnTakesRest(); boolean isEscaped = oi.isEscaped(); byte escapeChar = oi.getEscapeChar(); + boolean isrtrim = oi.isRTrim(); + boolean isltrim = oi.isLTrim(); + boolean quoted = oi.isQuoted(); + byte quotechar = oi.getQuoteChar(); + boolean atFieldLeadin = true; + if (fields == null) { initLazyFields(oi.getAllStructFieldRefs()); @@ -112,15 +125,55 @@ private void parse() { int fieldByteBegin = start; int fieldByteEnd = start; byte[] bytes = this.bytes.getData(); + boolean inQuotes = false; // Go through all bytes in the byte[] while (fieldByteEnd <= structByteEnd) { - if (fieldByteEnd == structByteEnd || bytes[fieldByteEnd] == separator) { + + if (fieldByteEnd == structByteEnd || + ((bytes[fieldByteEnd] == separator) && !inQuotes)) + { // Reached the end of a field? 
if (lastColumnTakesRest && fieldId == fields.length - 1) { fieldByteEnd = structByteEnd; } startPosition[fieldId] = fieldByteBegin; + dataStartPosition[fieldId] = fieldByteBegin; + dataEndPosition[fieldId] = fieldByteEnd+1; + + // check for field adjustments - rtrim + if (isrtrim) + { + if ((fieldByteBegin < structByteEnd) && + (fieldByteEnd - fieldByteBegin > 0) && + (fieldByteBegin >= 0)) + { + int dataEnd = fieldByteEnd; + while ((dataEnd > fieldByteBegin) && + ((bytes[dataEnd-1] == TAB_CHAR) || + (bytes[dataEnd-1] == SPACE_CHAR))) + dataEnd--; + startPosition[fieldId] = fieldByteBegin; + dataStartPosition[fieldId] = fieldByteBegin; + dataEndPosition[fieldId] = dataEnd+1; + } + } + // check for field adjustments - quotes after seeing closing quote + if (quoted && !inQuotes) + { + if ((fieldByteBegin < structByteEnd) && + ((fieldByteEnd -1 ) < structByteEnd) && + (fieldByteEnd > 1) && + (fieldByteBegin >= 0) && + (bytes[fieldByteEnd-1] == quotechar) && + (bytes[fieldByteBegin] == quotechar)) + { + startPosition[fieldId] = fieldByteBegin+1; + dataStartPosition[fieldId] = fieldByteBegin+1; + dataEndPosition[fieldId] = fieldByteEnd; + } + + } fieldId++; if (fieldId == fields.length || fieldByteEnd == structByteEnd) { // All fields have been parsed, or bytes have been parsed. @@ -129,19 +182,42 @@ private void parse() { // For missing fields, their starting positions will all be the same, // which will make their lengths to be -1 and uncheckedGetField will // return these fields as NULLs. 
+ for (int i = fieldId; i <= fields.length; i++) { startPosition[i] = fieldByteEnd + 1; + dataStartPosition[i] = fieldByteEnd + 1; + dataEndPosition[i] = fieldByteEnd + 1; } break; } fieldByteBegin = fieldByteEnd + 1; fieldByteEnd++; + atFieldLeadin = true; + inQuotes = false; + } else { if (isEscaped && bytes[fieldByteEnd] == escapeChar - && fieldByteEnd + 1 < structByteEnd) { + && (fieldByteEnd + 1) < structByteEnd) { // ignore the char after escape_char + if (isltrim && atFieldLeadin) + { + fieldByteBegin = fieldByteEnd; + atFieldLeadin = false; + } + fieldByteEnd += 2; } else { + if (isltrim && atFieldLeadin + && (bytes[fieldByteEnd] != SPACE_CHAR) + && (bytes[fieldByteEnd] != TAB_CHAR)) + { + atFieldLeadin = false; + fieldByteBegin = fieldByteEnd; + } + + if (quoted && (bytes[fieldByteEnd] == quotechar)) + inQuotes = !inQuotes; + fieldByteEnd++; } } @@ -178,8 +254,11 @@ protected final void initLazyFields(List fieldRefs) { // Extra element to make sure we have the same formula to compute the // length of each element of the array. startPosition = new int[fields.length + 1]; + dataStartPosition = new int[fields.length + 1]; + dataEndPosition = new int[fields.length + 1]; } + protected LazyObjectBase createLazyField(int fieldID, StructField fieldRef) throws SerDeException { return LazyFactory.createLazyObject(fieldRef.getFieldObjectInspector()); } @@ -218,8 +297,9 @@ private Object uncheckedGetField(int fieldID) { Text nullSequence = oi.getNullSequence(); // Test the length first so in most cases we avoid doing a byte[] // comparison. 
- int fieldByteBegin = startPosition[fieldID]; - int fieldLength = startPosition[fieldID + 1] - startPosition[fieldID] - 1; + // note data start may not be field start + int fieldByteBegin = dataStartPosition[fieldID]; + int fieldLength = dataEndPosition[fieldID ] - dataStartPosition[fieldID] - 1; if ((fieldLength < 0) || (fieldLength == nullSequence.getLength() && LazyUtils.compare(bytes .getData(), fieldByteBegin, fieldLength, nullSequence.getBytes(), @@ -351,4 +431,4 @@ public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) { public byte[] getBytes() { return bytes.getData(); } -} \ No newline at end of file +} diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java index 3943508..8682c77 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java @@ -411,14 +411,37 @@ static byte getSeparator(byte[] separators, int level) throws SerDeException { throw new SerDeException(msg, e); } } - - public static void copyAndEscapeStringDataToText(byte[] inputBytes, int start, int length, - byte escapeChar, Text data) { - + + /** + * Utility function to copy and escape string data and handle quoted fields + * When this function is called, outer quotes have already been removed + * Note that the enclosing char does not have to be a quote char - it can be + * non quote char as long as it does not conflict with the escape char + * @param inputBytes - the data to copy + * @param start - start location within data + * @param length -length of data to process + * @param escapeChar - single byte escape char + * @param quoteChar - single byte quote char + * @param escaped - is escape char supplied + * @param quoted - should data be treated as being embedded in quotes + * @param treatDblQuoteAsQuote - should embedded double quotes be treated as a single occurrence of the same quote + * + */ + public static 
void copyAndEscapeStringDataToText(byte[] inputBytes, int start, + int length, byte escapeChar, byte quoteChar, boolean escaped, + boolean quoted, boolean treatDblQuoteAsQuote, Text data) { // First calculate the length of the output string int outputLength = 0; for (int i = 0; i < length; i++) { - if (inputBytes[start + i] != escapeChar) { + + // handle treating double quotes as single quote char + // note double quote char means two quote chars not the char '\"' + if (inputBytes[start + i] == quoteChar && treatDblQuoteAsQuote + && (i+1 < length) && (inputBytes[start + i+1] == quoteChar)) { + outputLength++; + i++; + } + else if (inputBytes[start + i] != escapeChar) { outputLength++; } else { outputLength++; @@ -438,7 +461,18 @@ public static void copyAndEscapeStringDataToText(byte[] inputBytes, int start, i byte[] outputBytes = data.getBytes(); for (int i = 0; i < length; i++) { byte b = inputBytes[start + i]; - if (b != escapeChar || i == length - 1) { + + // handle successive quotes + // handling depends on whether to treat double quotes as quote + // if treating double quotes as quote - then emit single quote + // otherwise emit unchanged + if (quoted && (b == quoteChar) && treatDblQuoteAsQuote + && (i+1 < length) && (inputBytes[start+i+1] == quoteChar)) + { + outputBytes[k++] = inputBytes[start + i]; + i++; + } + else if (!escaped || b != escapeChar || i == length - 1) { outputBytes[k++] = b; } else { // get the next byte diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyObjectInspectorFactory.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyObjectInspectorFactory.java index 1abd8a5..b1d16c8 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyObjectInspectorFactory.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyObjectInspectorFactory.java @@ -47,37 +47,69 @@ public static LazySimpleStructObjectInspector getLazySimpleStructObjectInspector List 
structFieldNames, List structFieldObjectInspectors, byte separator, Text nullSequence, boolean lastColumnTakesRest, boolean escaped, - byte escapeChar) { + byte escapeChar, boolean quoted,byte quotechar, boolean ltrim, boolean rtrim, + boolean dblQuoteAsQuote) { return getLazySimpleStructObjectInspector(structFieldNames, structFieldObjectInspectors, null, separator, nullSequence, - lastColumnTakesRest, escaped, escapeChar, ObjectInspectorOptions.JAVA); + lastColumnTakesRest, escaped, escapeChar, quoted, quotechar, ltrim, rtrim, + dblQuoteAsQuote, ObjectInspectorOptions.JAVA); } public static LazySimpleStructObjectInspector getLazySimpleStructObjectInspector( + List structFieldNames, + List structFieldObjectInspectors, byte separator, + Text nullSequence, boolean lastColumnTakesRest, boolean escaped, + byte escapeChar) { + return getLazySimpleStructObjectInspector(structFieldNames, + structFieldObjectInspectors, null, separator, nullSequence, + lastColumnTakesRest, escaped, escapeChar, false, (byte)0, false, false, + false, ObjectInspectorOptions.JAVA); + } + + + public static LazySimpleStructObjectInspector getLazySimpleStructObjectInspector( List structFieldNames, List structFieldObjectInspectors, byte separator, Text nullSequence, boolean lastColumnTakesRest, boolean escaped, - byte escapeChar, ObjectInspectorOptions option) { + byte escapeChar, boolean quoted,byte quotechar, boolean ltrim, boolean rtrim, + boolean dblQuoteAsQuote, ObjectInspectorOptions option) { return getLazySimpleStructObjectInspector(structFieldNames, structFieldObjectInspectors, null, separator, nullSequence, - lastColumnTakesRest, escaped, escapeChar, option); + lastColumnTakesRest, escaped, escapeChar, quoted, quotechar, ltrim, rtrim, + dblQuoteAsQuote,option); } public static LazySimpleStructObjectInspector getLazySimpleStructObjectInspector( List structFieldNames, List structFieldObjectInspectors, List structFieldComments, byte separator, Text nullSequence, boolean lastColumnTakesRest, 
- boolean escaped, byte escapeChar) { + boolean escaped, byte escapeChar, boolean quoted, byte quotechar, boolean ltrim, boolean rtrim, + boolean dblQuoteAsQuote) + { return getLazySimpleStructObjectInspector(structFieldNames, structFieldObjectInspectors, structFieldComments, separator, nullSequence, lastColumnTakesRest, escaped, escapeChar, + quoted, quotechar, ltrim, rtrim, dblQuoteAsQuote, ObjectInspectorOptions.JAVA); } public static LazySimpleStructObjectInspector getLazySimpleStructObjectInspector( + List structFieldNames, + List structFieldObjectInspectors, List structFieldComments, + byte separator, Text nullSequence, boolean lastColumnTakesRest, + boolean escaped, byte escapeChar) + { + return getLazySimpleStructObjectInspector(structFieldNames, structFieldObjectInspectors, + structFieldComments, separator, nullSequence, lastColumnTakesRest, escaped, escapeChar, + false, (byte)0, false, false, false, + ObjectInspectorOptions.JAVA); + } + + public static LazySimpleStructObjectInspector getLazySimpleStructObjectInspector( List structFieldNames, List structFieldObjectInspectors, List structFieldComments, byte separator, Text nullSequence, boolean lastColumnTakesRest, - boolean escaped,byte escapeChar, ObjectInspectorOptions option) { + boolean escaped,byte escapeChar,boolean quoted,byte quotechar, boolean ltrim, boolean rtrim, + boolean dblQuoteAsQuote, ObjectInspectorOptions option) { ArrayList signature = new ArrayList(); signature.add(structFieldNames); signature.add(structFieldObjectInspectors); @@ -87,6 +119,11 @@ public static LazySimpleStructObjectInspector getLazySimpleStructObjectInspector signature.add(Boolean.valueOf(escaped)); signature.add(Byte.valueOf(escapeChar)); signature.add(option); + signature.add(Boolean.valueOf(quoted)); + signature.add(Byte.valueOf(quotechar)); + signature.add(Boolean.valueOf(ltrim)); + signature.add(Boolean.valueOf(rtrim)); + signature.add(Boolean.valueOf(dblQuoteAsQuote)); if(structFieldComments != null) { 
signature.add(structFieldComments); } @@ -98,7 +135,7 @@ public static LazySimpleStructObjectInspector getLazySimpleStructObjectInspector result = new LazySimpleStructObjectInspector(structFieldNames, structFieldObjectInspectors, structFieldComments, separator, nullSequence, lastColumnTakesRest, escaped, - escapeChar); + escapeChar, quoted, quotechar, ltrim, rtrim, dblQuoteAsQuote); break; case AVRO: result = diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazySimpleStructObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazySimpleStructObjectInspector.java index 9611e9f..0d276e2 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazySimpleStructObjectInspector.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazySimpleStructObjectInspector.java @@ -46,7 +46,12 @@ private boolean lastColumnTakesRest; private boolean escaped; private byte escapeChar; - + private boolean quoted; + private byte quoteChar; + private boolean ltrim; + private boolean rtrim; + private boolean dblQuoteAsQuote; + protected LazySimpleStructObjectInspector() { super(); } @@ -65,30 +70,53 @@ protected LazySimpleStructObjectInspector( protected LazySimpleStructObjectInspector(List structFieldNames, List structFieldObjectInspectors, byte separator, Text nullSequence, boolean lastColumnTakesRest, boolean escaped, - byte escapeChar) { + byte escapeChar, boolean quoted, byte quotechar, boolean ltrim, boolean rtrim, + boolean dblQuoteAsQuote) { init(structFieldNames, structFieldObjectInspectors, null, separator, - nullSequence, lastColumnTakesRest, escaped, escapeChar); + nullSequence, lastColumnTakesRest, escaped, escapeChar, quoted, quotechar, ltrim, rtrim, + dblQuoteAsQuote); } public LazySimpleStructObjectInspector(List structFieldNames, List structFieldObjectInspectors, List structFieldComments, byte separator, Text nullSequence, - boolean lastColumnTakesRest, boolean escaped, 
byte escapeChar) { + boolean lastColumnTakesRest, boolean escaped, byte escapeChar, + boolean quoted, byte quotechar, boolean ltrim, boolean rtrim, + boolean dblQuoteAsQuote) { + init(structFieldNames, structFieldObjectInspectors, structFieldComments, + separator, nullSequence, lastColumnTakesRest, escaped, escapeChar, + quoted, quotechar, ltrim, rtrim, + dblQuoteAsQuote); + } + + public LazySimpleStructObjectInspector(List structFieldNames, + List structFieldObjectInspectors, + List structFieldComments, byte separator, Text nullSequence, + boolean lastColumnTakesRest, boolean escaped, byte escapeChar + ) { init(structFieldNames, structFieldObjectInspectors, structFieldComments, - separator, nullSequence, lastColumnTakesRest, escaped, escapeChar); + separator, nullSequence, lastColumnTakesRest, escaped, escapeChar, + false, (byte)0, false, false, false); } + protected void init(List structFieldNames, List structFieldObjectInspectors, List structFieldComments, byte separator, Text nullSequence, boolean lastColumnTakesRest, boolean escaped, - byte escapeChar) { + byte escapeChar, boolean quoted, byte quotechar, boolean ltrim, boolean rtrim, + boolean dblQuoteAsQuote) { init(structFieldNames, structFieldObjectInspectors, structFieldComments); this.separator = separator; this.nullSequence = nullSequence; this.lastColumnTakesRest = lastColumnTakesRest; this.escaped = escaped; this.escapeChar = escapeChar; + this.quoted = quoted; + this.quoteChar = quotechar; + this.ltrim = ltrim; + this.rtrim = rtrim; + this.dblQuoteAsQuote = dblQuoteAsQuote; } // With Data @@ -157,4 +185,25 @@ public byte getEscapeChar() { return escapeChar; } + public boolean isQuoted() { + return quoted; + } + + public byte getQuoteChar() { + return quoteChar; + } + + public boolean isLTrim() { + return ltrim; + } + + public boolean isRTrim() { + return rtrim; + } + + public boolean treatDblQuoteAsQuote() + { + return dblQuoteAsQuote; + } + } diff --git 
serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveCharObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveCharObjectInspector.java index 2b0ad15..bae9bba 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveCharObjectInspector.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveCharObjectInspector.java @@ -31,19 +31,30 @@ private boolean escaped; private byte escapeChar; + private boolean quoted=false; + private byte quoteChar; + private boolean dblQuotesAsQuote=false; // no-arg ctor required for Kyro public LazyHiveCharObjectInspector() { } public LazyHiveCharObjectInspector(CharTypeInfo typeInfo) { - this(typeInfo, false, (byte)0); + this(typeInfo, false, (byte)0, false, (byte)0, false); + } + + public LazyHiveCharObjectInspector(CharTypeInfo typeInfo, boolean escaped, byte escapeChar ) { + this(typeInfo, escaped, escapeChar, false, (byte)0,false ); } - public LazyHiveCharObjectInspector(CharTypeInfo typeInfo, boolean escaped, byte escapeChar) { + public LazyHiveCharObjectInspector(CharTypeInfo typeInfo, boolean escaped, byte escapeChar ,boolean quoted, byte quotechar, + boolean dblQuotesAsQuote) { super(typeInfo); this.escaped = escaped; this.escapeChar = escapeChar; + this.quoted=quoted; + this.quoteChar=quotechar; + this.dblQuotesAsQuote = dblQuotesAsQuote; } @Override @@ -79,6 +90,20 @@ public boolean isEscaped() { public byte getEscapeChar() { return escapeChar; } + + + public boolean isQuoted() { + return quoted; + } + + public byte getQuoteChar() { + return quoteChar; + } + + public boolean treatDblQuotesAsQuote() { + return dblQuotesAsQuote; + } + @Override public String toString() { diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveVarcharObjectInspector.java 
serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveVarcharObjectInspector.java index 8e7acce..2896ae0 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveVarcharObjectInspector.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveVarcharObjectInspector.java @@ -22,6 +22,7 @@ import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; import org.apache.hadoop.hive.serde2.lazy.LazyHiveVarchar; import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.BaseCharUtils; @@ -31,19 +32,30 @@ private boolean escaped; private byte escapeChar; + private boolean quoted=false; + private byte quoteChar; + private boolean dblQuotesAsQuote=false; // no-arg ctor required for Kyro public LazyHiveVarcharObjectInspector() { } public LazyHiveVarcharObjectInspector(VarcharTypeInfo typeInfo) { - this(typeInfo, false, (byte)0); + this(typeInfo, false, (byte)0, false, (byte)0, false); + } + + public LazyHiveVarcharObjectInspector(VarcharTypeInfo typeInfo, boolean escaped, byte escapeChar ) { + this(typeInfo, escaped, escapeChar, false, (byte)0,false ); } - public LazyHiveVarcharObjectInspector(VarcharTypeInfo typeInfo, boolean escaped, byte escapeChar) { + public LazyHiveVarcharObjectInspector(VarcharTypeInfo typeInfo, boolean escaped, byte escapeChar, + boolean quoted, byte quotechar, boolean dblQuotesAsQuote) { super(typeInfo); this.escaped = escaped; this.escapeChar = escapeChar; + this.quoted=quoted; + this.quoteChar=quotechar; + this.dblQuotesAsQuote = dblQuotesAsQuote; } @Override @@ -79,6 +91,19 @@ public boolean isEscaped() { public byte getEscapeChar() { return escapeChar; } + + public boolean isQuoted() { + return quoted; + } + + public byte getQuoteChar() { + return 
quoteChar; + } + + public boolean treatDblQuotesAsQuote() { + return dblQuotesAsQuote; + } + @Override public String toString() { diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java index 08fec77..adbc725 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java @@ -112,21 +112,29 @@ private LazyPrimitiveObjectInspectorFactory() { } public static AbstractPrimitiveLazyObjectInspector getLazyObjectInspector( + PrimitiveTypeInfo typeInfo, boolean escaped, byte escapeChar, boolean quoted, byte quotechar, boolean dblQuoteAsQuote) { + return getLazyObjectInspector(typeInfo, escaped, escapeChar, quoted, quotechar, dblQuoteAsQuote, false); + } + + public static AbstractPrimitiveLazyObjectInspector getLazyObjectInspector( PrimitiveTypeInfo typeInfo, boolean escaped, byte escapeChar) { - return getLazyObjectInspector(typeInfo, escaped, escapeChar, false); + return getLazyObjectInspector(typeInfo, escaped, escapeChar, false, (byte)0, false, false); } + + public static AbstractPrimitiveLazyObjectInspector getLazyObjectInspector( - PrimitiveTypeInfo typeInfo, boolean escaped, byte escapeChar, boolean extBoolean) { + PrimitiveTypeInfo typeInfo, boolean escaped, byte escapeChar,boolean quoted, byte quotechar, boolean dblQuoteAsQuote, + boolean extBoolean) { PrimitiveCategory primitiveCategory = typeInfo.getPrimitiveCategory(); switch(primitiveCategory) { case STRING: - return getLazyStringObjectInspector(escaped, escapeChar); + return getLazyStringObjectInspector(escaped, escapeChar, quoted, quotechar, dblQuoteAsQuote); case CHAR: - return getLazyHiveCharObjectInspector((CharTypeInfo)typeInfo, escaped, 
escapeChar); + return getLazyHiveCharObjectInspector((CharTypeInfo)typeInfo, escaped, escapeChar,quoted, quotechar, dblQuoteAsQuote ); case VARCHAR: - return getLazyHiveVarcharObjectInspector((VarcharTypeInfo)typeInfo, escaped, escapeChar); + return getLazyHiveVarcharObjectInspector((VarcharTypeInfo)typeInfo, escaped, escapeChar, quoted, quotechar, dblQuoteAsQuote); case BOOLEAN: return getLazyBooleanObjectInspector(extBoolean); default: @@ -160,46 +168,64 @@ private LazyPrimitiveObjectInspectorFactory() { cachedPrimitiveLazyObjectInspectors.put(typeInfo, poi); return poi; } + + public static LazyStringObjectInspector getLazyStringObjectInspector(boolean escaped, byte escapeChar) + { + return getLazyStringObjectInspector( escaped, escapeChar, false, (byte)0, false); + } - public static LazyStringObjectInspector getLazyStringObjectInspector(boolean escaped, byte escapeChar) { + + public static LazyStringObjectInspector getLazyStringObjectInspector(boolean escaped, byte escapeChar, + boolean quoted, byte quotechar, boolean dblQuoteAsQuote) { ArrayList signature = new ArrayList(); signature.add(TypeInfoFactory.stringTypeInfo); signature.add(Boolean.valueOf(escaped)); signature.add(Byte.valueOf(escapeChar)); + signature.add(Boolean.valueOf(quoted)); + signature.add(Byte.valueOf(quotechar)); + signature.add(Boolean.valueOf(dblQuoteAsQuote)); LazyStringObjectInspector result = (LazyStringObjectInspector) cachedLazyStringTypeOIs .get(signature); if (result == null) { - result = new LazyStringObjectInspector(escaped, escapeChar); + result = new LazyStringObjectInspector(escaped, escapeChar, quoted, quotechar, dblQuoteAsQuote); cachedLazyStringTypeOIs.put(signature, result); } return result; } public static LazyHiveCharObjectInspector getLazyHiveCharObjectInspector( - CharTypeInfo typeInfo, boolean escaped, byte escapeChar) { + CharTypeInfo typeInfo, boolean escaped, byte escapeChar, boolean quoted, byte quotechar, boolean dblQuoteAsQuote) { ArrayList signature = new 
ArrayList(); signature.add(typeInfo); signature.add(Boolean.valueOf(escaped)); signature.add(Byte.valueOf(escapeChar)); + signature.add(Boolean.valueOf(quoted)); + signature.add(Byte.valueOf(quotechar)); + signature.add(Boolean.valueOf(dblQuoteAsQuote)); + LazyHiveCharObjectInspector result = (LazyHiveCharObjectInspector) cachedLazyStringTypeOIs .get(signature); if (result == null) { - result = new LazyHiveCharObjectInspector(typeInfo, escaped, escapeChar); + result = new LazyHiveCharObjectInspector(typeInfo, escaped, escapeChar, quoted, quotechar, dblQuoteAsQuote); cachedLazyStringTypeOIs.put(signature, result); } return result; } public static LazyHiveVarcharObjectInspector getLazyHiveVarcharObjectInspector( - VarcharTypeInfo typeInfo, boolean escaped, byte escapeChar) { + VarcharTypeInfo typeInfo, boolean escaped, byte escapeChar, boolean quoted, byte quotechar, boolean dblQuoteAsQuote) { ArrayList signature = new ArrayList(); signature.add(typeInfo); signature.add(Boolean.valueOf(escaped)); signature.add(Byte.valueOf(escapeChar)); + signature.add(Boolean.valueOf(quoted)); + signature.add(Byte.valueOf(quotechar)); + signature.add(Boolean.valueOf(dblQuoteAsQuote)); + LazyHiveVarcharObjectInspector result = (LazyHiveVarcharObjectInspector) cachedLazyStringTypeOIs .get(signature); if (result == null) { - result = new LazyHiveVarcharObjectInspector(typeInfo, escaped, escapeChar); + result = new LazyHiveVarcharObjectInspector(typeInfo, escaped, escapeChar, quoted, quotechar, dblQuoteAsQuote); cachedLazyStringTypeOIs.put(signature, result); } return result; diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyStringObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyStringObjectInspector.java index 28a25d6..f99f1b5 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyStringObjectInspector.java +++ 
serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyStringObjectInspector.java @@ -20,6 +20,7 @@ import org.apache.hadoop.hive.serde2.lazy.LazyString; import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.Text; /** @@ -28,17 +29,29 @@ public class LazyStringObjectInspector extends AbstractPrimitiveLazyObjectInspector implements StringObjectInspector { - private boolean escaped; - private byte escapeChar; + private boolean escaped =false; + private byte escapeChar = '\\'; + private boolean quoted=false; + private byte quoteChar ='\"'; + private boolean dblQuotesAsQuote=false; protected LazyStringObjectInspector() { super(); } - LazyStringObjectInspector(boolean escaped, byte escapeChar) { + LazyStringObjectInspector( boolean escaped, byte escapeChar ) { + this( escaped, escapeChar, false, (byte)0,false ); + } + + LazyStringObjectInspector(boolean escaped, byte escapeChar,boolean quoted, byte quotechar, + boolean dblQuotesAsQuote) + { super(TypeInfoFactory.stringTypeInfo); this.escaped = escaped; this.escapeChar = escapeChar; + this.quoted=quoted; + this.quoteChar=quotechar; + this.dblQuotesAsQuote = dblQuotesAsQuote; } @Override @@ -63,5 +76,17 @@ public boolean isEscaped() { public byte getEscapeChar() { return escapeChar; } + + public boolean isQuoted() { + return quoted; + } + + public byte getQuoteChar() { + return quoteChar; + } + + public boolean treatDblQuotesAsQuote() { + return dblQuotesAsQuote; + } } diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java index 3d7f11e..9db250a 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java +++ serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java @@ -377,6 
+377,27 @@ public void testLazyString() throws Throwable { } } + /** + * Test the LazyString class. + */ + public void testLazyStringVariant2() throws Throwable { + try { + LazyString b = new LazyString(LazyPrimitiveObjectInspectorFactory + .getLazyStringObjectInspector(false, (byte) 0, false, (byte)0, false)); + initLazyObject(b, new byte[] {'0'}, 0, 0); + assertEquals(new Text(""), b.getWritableObject()); + initLazyObject(b, new byte[] {'0'}, 0, 1); + assertEquals(new Text("0"), b.getWritableObject()); + initLazyObject(b, new byte[] {'0', '1', '2'}, 1, 1); + assertEquals(new Text("1"), b.getWritableObject()); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + public void testLazyBinary() { LazyBinary ba = new LazyBinary(LazyPrimitiveObjectInspectorFactory.LAZY_BINARY_OBJECT_INSPECTOR); initLazyObject(ba, new byte[] {}, 0, 0); diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleSerDe.java serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleSerDe.java index cf86973..67973c2 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleSerDe.java +++ serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleSerDe.java @@ -29,6 +29,8 @@ import org.apache.hadoop.hive.serde2.SerDeUtils; import org.apache.hadoop.hive.serde2.io.ByteWritable; import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; import org.apache.hadoop.hive.serde2.io.ShortWritable; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; @@ -94,6 +96,7 @@ private void deserializeAndSerialize(LazySimpleSerDe serDe, Text t, String s, if (fieldData != null) { fieldData = ((LazyPrimitive) fieldData).getWritableObject(); } + assertEquals("Field " + i, expectedFieldsData[i], fieldData); } // Serialize @@ -199,5 +202,551 @@ public 
void testLazySimpleSerDeMissingColumns() throws Throwable { throw e; } } + + + public void testLazySimpleSerDeLeftTrimVariant1() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties tbl = new Properties(); + tbl.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + tbl.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + tbl.setProperty(serdeConstants.QUOTE_CHAR, "\""); + tbl.setProperty(serdeConstants.FIELD_DELIM, ","); + tbl.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + tbl.setProperty("columns", + "pk,col1,col2,col3"); + tbl.setProperty("columns.types", + "smallint:string:string:int"); + tbl.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + tbl.setProperty(LazySimpleSerDe.FIELD_LTRIM,"true"); + + serDe.initialize(conf, tbl); + + // Data + String sData = " 123, 456, 789, 1000"; + Text t = new Text(sData); + Object[] expectedFieldsData = { + new ShortWritable((short) 123), + new Text("456"), + new Text("789"), + new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123,456,789,1000", expectedFieldsData); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + public void testLazySimpleSerDeLeftTrimVariant2() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties tbl = new Properties(); + tbl.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + tbl.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + tbl.setProperty(serdeConstants.QUOTE_CHAR, "\""); + tbl.setProperty(serdeConstants.FIELD_DELIM, ","); + tbl.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + tbl.setProperty("columns", + "pk,col1,col2,col3"); + tbl.setProperty("columns.types", + "smallint:string:string:int"); + tbl.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + tbl.setProperty(LazySimpleSerDe.FIELD_LTRIM,"true"); + + 
serDe.initialize(conf, tbl); + + // Data + String sData = " 123, \" 456, 789\", 789, 1000"; + Text t = new Text(sData); + Object[] expectedFieldsData = { + new ShortWritable((short) 123), + new Text(" 456, 789"), + new Text("789"), + new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123, 456\\, 789,789,1000", expectedFieldsData); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + public void testLazySimpleSerDeEmbeddedQuotesVariant1() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties serdeProps = new Properties(); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + serdeProps.setProperty(serdeConstants.QUOTE_CHAR, "\""); + serdeProps.setProperty(serdeConstants.FIELD_DELIM, ","); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty("columns", + "pk,col1,col2,col3"); + serdeProps.setProperty("columns.types", + "smallint:string:string:int"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_LTRIM,"true"); + + serDe.initialize(conf, serdeProps); + + // Data + String sData = " 123, \" 456\"\", 789\", 789, 1000"; + Text t = new Text(sData); + Object[] expectedFieldsData = { + new ShortWritable((short) 123), + new Text(" 456\"\", 789"), + new Text("789"), + new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123, 456\\\"\\\"\\, 789,789,1000", expectedFieldsData); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + public void testLazySimpleSerDeEmbeddedQuotesVariant2() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties serdeProps = new Properties(); + 
serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + serdeProps.setProperty(serdeConstants.QUOTE_CHAR, "\""); + serdeProps.setProperty(serdeConstants.FIELD_DELIM, ","); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty("columns", + "pk,col1,col2,col3"); + serdeProps.setProperty("columns.types", + "smallint:string:string:int"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_LTRIM,"true"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_DOUBLEQUOTESASQUOTE,"true"); + + serDe.initialize(conf, serdeProps); + + // Data + String sData = " 123, \" 456\"\", 789\", 789, 1000"; + Text t = new Text(sData); + Object[] expectedFieldsData = { + new ShortWritable((short) 123), + new Text(" 456\", 789"), + new Text("789"), + new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123, 456\\\"\\, 789,789,1000", expectedFieldsData); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + public void testLazySimpleSerDeEmbeddedQuotesVariant2a() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties serdeProps = new Properties(); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + serdeProps.setProperty(serdeConstants.QUOTE_CHAR, "\""); + serdeProps.setProperty(serdeConstants.FIELD_DELIM, ","); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty("columns", + "pk,col1,col2,col3"); + serdeProps.setProperty("columns.types", + "smallint:varchar(50):varchar(50):int"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_LTRIM,"true"); + 
serdeProps.setProperty(LazySimpleSerDe.FIELD_DOUBLEQUOTESASQUOTE,"true"); + + serDe.initialize(conf, serdeProps); + + // Data + String sData = " 123, \" 456\"\", 789\", 789, 1000"; + Text t = new Text(sData); + + HiveVarcharWritable a = new HiveVarcharWritable(); + a.set(" 456\", 789"); + HiveVarcharWritable b=new HiveVarcharWritable(); + b.set("789"); + Object[] expectedFieldsData = { + new ShortWritable((short) 123), + a, + b, + new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123, 456\\\"\\, 789,789,1000", expectedFieldsData); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + public void testLazySimpleSerDeEmbeddedQuotesVariant2b() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties serdeProps = new Properties(); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + serdeProps.setProperty(serdeConstants.QUOTE_CHAR, "\""); + serdeProps.setProperty(serdeConstants.FIELD_DELIM, ","); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty("columns", + "pk,col1,col2,col3"); + serdeProps.setProperty("columns.types", + "smallint:char(10):char(10):int"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_LTRIM,"true"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_DOUBLEQUOTESASQUOTE,"true"); + + serDe.initialize(conf, serdeProps); + + // Data + String sData = " 123, \" 456\"\", 789\", 789, 1000"; + Text t = new Text(sData); + + HiveCharWritable a = new HiveCharWritable(); + a.set(" 456\", 789"); + HiveCharWritable b=new HiveCharWritable(); + b.set("789 "); + Object[] expectedFieldsData = { + new ShortWritable((short) 123), + a, + b, + new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123, 456\\\"\\, 789,789 ,1000", 
expectedFieldsData); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + /*** + * Test that only leading whitespace is trimmed, with no quote handling, when only FIELD_LTRIM is set + */ + public void testLazySimpleSerDeQuoteHandlingLeftTrimOnly() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties serdeProps = new Properties(); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + serdeProps.setProperty(serdeConstants.FIELD_DELIM, ","); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty("columns", + "pk,col1,col2,col3,col4"); + serdeProps.setProperty("columns.types", + "smallint:string:string:string:int"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_LTRIM, "true"); + + + serDe.initialize(conf, serdeProps); + + // Data + String sData = " 123, \" 456\"\", 789\" , 789, 1000"; + Text t = new Text(sData); + Object[] expectedFieldsData = { + new ShortWritable((short) 123), + new Text("\" 456\"\""), + new Text("789\" "), + new Text("789"), new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123,\" 456\"\",789\" ,789,1000", expectedFieldsData); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + /*** + * Test that only trailing whitespace is trimmed, with no quote handling, when only FIELD_RTRIM is set + */ + public void testLazySimpleSerDeQuoteHandlingRightTrimOnly() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties serdeProps = new Properties(); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + serdeProps.setProperty(serdeConstants.FIELD_DELIM, ","); +
serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty("columns", + "pk,col1,col2,col3,col4"); + serdeProps.setProperty("columns.types", + "smallint:string:string:string:int"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_RTRIM, "true"); + + + serDe.initialize(conf, serdeProps); + + // Data + String sData = " 123, \" 456\"\", 789\" , 789, 1000"; + Text t = new Text(sData); + Object[] expectedFieldsData = { + null, + new Text(" \" 456\"\""), + new Text(" 789\""), + new Text(" 789"), null}; + + // Test + deserializeAndSerialize(serDe, t, "NULL, \" 456\"\", 789\", 789,NULL", expectedFieldsData); + + sData = "123 , \" 456\"\", 789\" ,789 ,1000 "; + t = new Text(sData); + Object[] expectedFieldsData2 = { + new ShortWritable((short)123), + new Text(" \" 456\"\""), + new Text(" 789\""), + new Text("789"), new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123, \" 456\"\", 789\",789,1000", expectedFieldsData2); + + serdeProps.setProperty("columns", + "pk,col1,col2,col3,col4,col5,col6,col7"); + serdeProps.setProperty("columns.types", + "smallint:string:string:string:int:string:int:string"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_RTRIM, "true"); + serDe.initialize(conf, serdeProps); + + sData = "123 , \" 456\"\", 789\" ,789 ,1000 "; + t = new Text(sData); + Object[] expectedFieldsData3 = { + new ShortWritable((short)123), + new Text(" \" 456\"\""), + new Text(" 789\""), + new Text("789"), new IntWritable(1000), + null, null, null}; + + // Test + deserializeAndSerialize(serDe, t, "123, \" 456\"\", 789\",789,1000,NULL,NULL,NULL", expectedFieldsData3); + + serdeProps.setProperty("columns", + "pk,col1,col2,col3,col4"); + serdeProps.setProperty("columns.types", + "smallint:string:string:string:string"); + 
serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_RTRIM, "true"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST, "true"); + serDe.initialize(conf, serdeProps); + + sData = "123 , \" 456\"\", 789\" ,789 ,1000 , 23 "; + t = new Text(sData); + Object[] expectedFieldsData4 = { + new ShortWritable((short)123), + new Text(" \" 456\"\""), + new Text(" 789\""), + new Text("789"), new Text("1000 , 23") + }; + + // Test + deserializeAndSerialize(serDe, t, "123, \" 456\"\", 789\",789,1000 \\, 23", expectedFieldsData4); + + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + /*** + * Test that leading and trailing whitespace is trimmed, with no quote handling, when FIELD_LTRIM and FIELD_RTRIM are both set + */ + public void testLazySimpleSerDeQuoteHandlingLeftAndRightTrim() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties serdeProps = new Properties(); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + serdeProps.setProperty(serdeConstants.FIELD_DELIM, ","); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty("columns", + "pk,col1,col2,col3,col4"); + serdeProps.setProperty("columns.types", + "smallint:string:string:string:int"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_RTRIM, "true"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_LTRIM, "true"); + + + serDe.initialize(conf, serdeProps); + + // Data + String sData = " 123, \" 456\"\", 789\" , 789, 1000"; + Text t = new Text(sData); + Object[] expectedFieldsData = { + new ShortWritable((short)123), + new Text("\" 456\"\""), + new Text("789\""), + new Text("789"), new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123,\"
456\"\",789\",789,1000", expectedFieldsData); + + sData = "123 , \" 456\"\", 789\" ,789 ,1000 "; + t = new Text(sData); + Object[] expectedFieldsData2 = { + new ShortWritable((short)123), + new Text("\" 456\"\""), + new Text("789\""), + new Text("789"), new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123,\" 456\"\",789\",789,1000", expectedFieldsData2); + + serdeProps.setProperty("columns", + "pk,col1,col2,col3,col4,col5,col6,col7"); + serdeProps.setProperty("columns.types", + "smallint:string:string:string:int:string:int:string"); + serDe.initialize(conf, serdeProps); + + sData = "123 , \" 456\"\", 789\" ,789 ,1000 "; + t = new Text(sData); + Object[] expectedFieldsData3 = { + new ShortWritable((short)123), + new Text("\" 456\"\""), + new Text("789\""), + new Text("789"), new IntWritable(1000), + null, null, null}; + + // Test + deserializeAndSerialize(serDe, t, "123,\" 456\"\",789\",789,1000,NULL,NULL,NULL", expectedFieldsData3); + + serdeProps.setProperty("columns", + "pk,col1,col2,col3,col4"); + serdeProps.setProperty("columns.types", + "smallint:string:string:string:string"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST, "true"); + serDe.initialize(conf, serdeProps); + + sData = "123 , \" 456\"\", 789\" ,789 ,1000 , 23 "; + t = new Text(sData); + Object[] expectedFieldsData4 = { + new ShortWritable((short)123), + new Text("\" 456\"\""), + new Text("789\""), + new Text("789"), new Text("1000 , 23") + }; + + // Test + deserializeAndSerialize(serDe, t, "123,\" 456\"\",789\",789,1000 \\, 23", expectedFieldsData4); + + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + + /*** + * Test that there is no trim or quote handling when options are off + */ + public void testLazySimpleSerDeQuoteHandlingWithoutOptions() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties serdeProps = new 
Properties(); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + serdeProps.setProperty(serdeConstants.FIELD_DELIM, ","); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty("columns", + "pk,col1,col2,col3,col4"); + serdeProps.setProperty("columns.types", + "smallint:string:string:string:int"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + + serDe.initialize(conf, serdeProps); + + // Data + String sData = " 123, \" 456\"\", 789\" , 789, 1000"; + Text t = new Text(sData); + Object[] expectedFieldsData = { + null, + new Text(" \" 456\"\""), + new Text(" 789\" "), + new Text(" 789"), null}; + + // Test + deserializeAndSerialize(serDe, t, "NULL, \" 456\"\", 789\" , 789,NULL", expectedFieldsData); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } + + public void testLazySimpleSerDeEmbeddedQuotesVariant3() throws Throwable { + try { + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties serdeProps = new Properties(); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + serdeProps.setProperty(serdeConstants.QUOTE_CHAR, "\""); + serdeProps.setProperty(serdeConstants.FIELD_DELIM, ","); + serdeProps.setProperty(serdeConstants.SERIALIZATION_FORMAT, ","); + serdeProps.setProperty("columns", + "pk,col1,col2,col3"); + serdeProps.setProperty("columns.types", + "smallint:string:string:int"); + serdeProps.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + serdeProps.setProperty(LazySimpleSerDe.FIELD_LTRIM,"true"); + + serDe.initialize(conf, serdeProps); + + // Data + String sData = " 123, \" 456\\\\\"\", 789\", 789, 1000"; + Text t = new Text(sData); + Object[] expectedFieldsData = { + new ShortWritable((short) 123), + new Text(" 456\\\"\", 789"), + 
new Text("789"), + new IntWritable(1000)}; + + // Test + deserializeAndSerialize(serDe, t, "123, 456\\\\\\\"\\\"\\, 789,789,1000", expectedFieldsData); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } }