diff --git a/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/MultiDelimitSerDe.java b/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/MultiDelimitSerDe.java index 957804a..3e1be7d 100644 --- a/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/MultiDelimitSerDe.java +++ b/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/MultiDelimitSerDe.java @@ -60,6 +60,7 @@ serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, serdeConstants.FIELD_DELIM, serdeConstants.COLLECTION_DELIM, serdeConstants.MAPKEY_DELIM, serdeConstants.SERIALIZATION_FORMAT, serdeConstants.SERIALIZATION_NULL_FORMAT, + serdeConstants.SERIALIZATION_ESCAPE_CRLF, serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST, serdeConstants.ESCAPE_CHAR, serdeConstants.SERIALIZATION_ENCODING, diff --git a/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java b/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java index ed2df5f..1adb913 100644 --- a/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java +++ b/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java @@ -48,6 +48,7 @@ serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, serdeConstants.FIELD_DELIM, serdeConstants.COLLECTION_DELIM, serdeConstants.MAPKEY_DELIM, serdeConstants.SERIALIZATION_FORMAT, serdeConstants.SERIALIZATION_NULL_FORMAT, + serdeConstants.SERIALIZATION_ESCAPE_CRLF, serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST, serdeConstants.ESCAPE_CHAR, serdeConstants.SERIALIZATION_ENCODING, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java index b50eaab..f17c063 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java @@ -282,9 +282,10 @@ public static TableDesc getDefaultQueryOutputTableDesc(String cols, String colTy false, false, fileFormat); //enable escaping tblDesc.getProperties().setProperty(serdeConstants.ESCAPE_CHAR, "\\"); + tblDesc.getProperties().setProperty(serdeConstants.SERIALIZATION_ESCAPE_CRLF, "true"); //enable extended nesting levels tblDesc.getProperties().setProperty( - LazySerDeParameters.SERIALIZATION_EXTEND_ADDITIONAL_NESTING_LEVELS, "true"); + LazySerDeParameters.SERIALIZATION_EXTEND_ADDITIONAL_NESTING_LEVELS, "true"); return tblDesc; } diff --git a/ql/src/test/queries/clientpositive/escape_crlf.q b/ql/src/test/queries/clientpositive/escape_crlf.q new file mode 100644 index 0000000..d881c30 --- /dev/null +++ b/ql/src/test/queries/clientpositive/escape_crlf.q @@ -0,0 +1,20 @@ +-- base table with null data +DROP TABLE IF EXISTS base_tab; +CREATE TABLE base_tab(a STRING, b STRING); +DESCRIBE EXTENDED base_tab; + +INSERT INTO base_tab VALUES('This\ris\nthe first\r\nmulti-line field', 'field1-2'); +INSERT INTO base_tab VALUES('This\ris\nthe second\r\nmulti-line field', 'field2-2'); + +-- No crlf escaping +SELECT * FROM base_tab; + +-- Crlf escaping +ALTER TABLE base_tab SET SERDEPROPERTIES ('escape.delim'='\\', 'serialization.escape.crlf'='true'); +SELECT * FROM base_tab; + +SET hive.fetch.task.conversion=none; +-- Make sure intermediate serde works correctly +SELECT * FROM base_tab; + +DROP TABLE base_tab; diff --git a/ql/src/test/results/clientpositive/escape_crlf.q.out b/ql/src/test/results/clientpositive/escape_crlf.q.out new file mode 100644 index 0000000..c2968cc --- /dev/null +++ b/ql/src/test/results/clientpositive/escape_crlf.q.out @@ -0,0 +1,108 @@ +PREHOOK: query: -- base table with null data +DROP TABLE IF EXISTS base_tab +PREHOOK: type: DROPTABLE +POSTHOOK: query: -- base table with null data +DROP TABLE IF EXISTS base_tab +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE base_tab(a STRING, b STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@base_tab +POSTHOOK: query: CREATE TABLE base_tab(a STRING, b STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@base_tab +PREHOOK: query: DESCRIBE EXTENDED base_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@base_tab +POSTHOOK: query: DESCRIBE EXTENDED base_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@base_tab +a string +b string + +#### A masked pattern was here #### +PREHOOK: query: INSERT INTO base_tab VALUES('This\ris\nthe first\r\nmulti-line field', 'field1-2') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@base_tab +POSTHOOK: query: INSERT INTO base_tab VALUES('This\ris\nthe first\r\nmulti-line field', 'field1-2') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@base_tab +POSTHOOK: Lineage: base_tab.a SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: base_tab.b SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: INSERT INTO base_tab VALUES('This\ris\nthe second\r\nmulti-line field', 'field2-2') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@base_tab +POSTHOOK: query: INSERT INTO base_tab VALUES('This\ris\nthe second\r\nmulti-line field', 'field2-2') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@base_tab +POSTHOOK: Lineage: base_tab.a SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: base_tab.b SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: -- No crlf escaping +SELECT * FROM base_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@base_tab +#### A masked pattern was here #### +POSTHOOK: query: -- No crlf escaping +SELECT * FROM base_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@base_tab +#### A masked pattern was here #### +This\ris\nthe first\r\nmulti-line field field1-2 +This\ris\nthe second\r\nmulti-line field field2-2 +PREHOOK: query: -- Crlf escaping +ALTER TABLE base_tab SET SERDEPROPERTIES ('escape.delim'='\\', 'serialization.escape.crlf'='true') +PREHOOK: type: ALTERTABLE_SERDEPROPERTIES +PREHOOK: Input: default@base_tab +PREHOOK: Output: default@base_tab +POSTHOOK: query: -- Crlf escaping +ALTER TABLE base_tab SET SERDEPROPERTIES ('escape.delim'='\\', 'serialization.escape.crlf'='true') +POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES +POSTHOOK: Input: default@base_tab +POSTHOOK: Output: default@base_tab +PREHOOK: query: SELECT * FROM base_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@base_tab +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM base_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@base_tab +#### A masked pattern was here #### +This +is +the first +multi-line field field1-2 +This +is +the second +multi-line field field2-2 +PREHOOK: query: -- Make sure intermediate serde works correctly +SELECT * FROM base_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@base_tab +#### A masked pattern was here #### +POSTHOOK: query: -- Make sure intermediate serde works correctly +SELECT * FROM base_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@base_tab +#### A masked pattern was here #### +This +is +the first +multi-line field field1-2 +This +is +the second +multi-line field field2-2 +PREHOOK: query: DROP TABLE base_tab +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@base_tab +PREHOOK: Output: default@base_tab +POSTHOOK: query: DROP TABLE base_tab +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@base_tab +POSTHOOK: Output: default@base_tab diff --git a/serde/if/serde.thrift b/serde/if/serde.thrift index d828bac..000e414 100644 --- a/serde/if/serde.thrift +++ b/serde/if/serde.thrift @@ -27,6 +27,7 @@ const string SERIALIZATION_CLASS = "serialization.class" const string SERIALIZATION_FORMAT = "serialization.format" const string SERIALIZATION_DDL = "serialization.ddl" const string SERIALIZATION_NULL_FORMAT = "serialization.null.format" +const string SERIALIZATION_ESCAPE_CRLF = "serialization.escape_crlf" const string SERIALIZATION_LAST_COLUMN_TAKES_REST = "serialization.last.column.takes.rest" const string SERIALIZATION_SORT_ORDER = "serialization.sort.order" const string SERIALIZATION_USE_JSON_OBJECTS = "serialization.use.json.object" diff --git a/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java b/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java index f12d4c7..7902849 100644 --- a/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java +++ b/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java @@ -46,6 +46,8 @@ public static final String SERIALIZATION_NULL_FORMAT = "serialization.null.format"; + public static final String SERIALIZATION_ESCAPE_CRLF = "serialization.escape.crlf"; + public static final String SERIALIZATION_LAST_COLUMN_TAKES_REST = "serialization.last.column.takes.rest"; public static final String SERIALIZATION_SORT_ORDER = "serialization.sort.order"; diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java index a611c05..0ca8e2d 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java @@ -55,6 +55,7 @@ serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, serdeConstants.FIELD_DELIM, serdeConstants.COLLECTION_DELIM, serdeConstants.MAPKEY_DELIM, serdeConstants.SERIALIZATION_FORMAT, serdeConstants.SERIALIZATION_NULL_FORMAT, + serdeConstants.SERIALIZATION_ESCAPE_CRLF, serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST, serdeConstants.ESCAPE_CHAR, serdeConstants.SERIALIZATION_ENCODING, @@ -112,11 +113,6 @@ public void initialize(Configuration conf, Properties tbl) throws SerDeException cachedObjectInspector, notSkipIDs, serdeParams.getNullSequence()); super.initialize(size); - LOG.debug("ColumnarSerDe initialized with: columnNames=" - + serdeParams.getColumnNames() + " columnTypes=" - + serdeParams.getColumnTypes() + " separator=" - + Arrays.asList(serdeParams.getSeparators()) + " nullstring=" - + serdeParams.getNullString()); } /** diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySerDeParameters.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySerDeParameters.java index 60d11a2..2ab6c5b 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySerDeParameters.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySerDeParameters.java @@ -25,6 +25,8 @@ import java.util.Map; import java.util.Properties; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.serde.serdeConstants; @@ -41,6 +43,7 @@ * */ public class LazySerDeParameters implements LazyObjectInspectorParameters { + public static final Log LOG = LogFactory.getLog(LazySerDeParameters.class.getName()); public static final byte[] DefaultSeparators = {(byte) 1, (byte) 2, (byte) 3}; public static final String SERIALIZATION_EXTEND_NESTING_LEVELS = "hive.serialization.extend.nesting.levels"; @@ -53,10 +56,10 @@ // The list of bytes used for the separators in the column (a nested struct // such as Array> will use multiple separators). // The list of separators + escapeChar are the bytes required to be escaped. - private byte[] separators; + private byte[] separators; - private String nullString; private Text nullSequence; + private TypeInfo rowTypeInfo; private boolean lastColumnTakesRest; private List columnNames; @@ -64,7 +67,7 @@ private boolean escaped; private byte escapeChar; - private boolean[] needsEscape = new boolean[256]; // A flag for each byte to indicate if escape is needed. + private boolean[] needsEscape = new boolean[256]; // A flag for each byte to indicate if escape is needed. private boolean extendedBooleanLiteral; List timestampFormats; @@ -72,8 +75,8 @@ public LazySerDeParameters(Configuration job, Properties tbl, String serdeName) throws SerDeException { this.tableProperties = tbl; this.serdeName = serdeName; - - nullString = tbl.getProperty( + + String nullString = tbl.getProperty( serdeConstants.SERIALIZATION_NULL_FORMAT, "\\N"); nullSequence = new Text(nullString); @@ -88,8 +91,8 @@ public LazySerDeParameters(Configuration job, Properties tbl, String serdeName) rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); collectSeparators(tbl); - - // Get the escape information + + // Get the escape information String escapeProperty = tbl.getProperty(serdeConstants.ESCAPE_CHAR); escaped = (escapeProperty != null); if (escaped) { @@ -98,8 +101,18 @@ public LazySerDeParameters(Configuration job, Properties tbl, String serdeName) for (byte b : separators) { needsEscape[b & 0xFF] = true; // Converts the negative byte into positive index } + + // '\r' and '\n' are reserved and can't be used for escape chars and separators + if (needsEscape['\r'] || needsEscape['\n']) { + throw new SerDeException("\\r and \\n cannot be used as escaping characters or separators"); + } + boolean isEscapeCRLF = Boolean.valueOf(tbl.getProperty(serdeConstants.SERIALIZATION_ESCAPE_CRLF)); + if (isEscapeCRLF) { + needsEscape['\r'] = true; + needsEscape['\n'] = true; + } } - + extendedBooleanLiteral = (job == null ? false : job.getBoolean(ConfVars.HIVE_LAZYSIMPLE_EXTENDED_BOOLEAN_LITERAL.varname, false)); @@ -108,8 +121,14 @@ public LazySerDeParameters(Configuration job, Properties tbl, String serdeName) if (timestampFormatsArray != null) { timestampFormats = Arrays.asList(timestampFormatsArray); } + + LOG.debug(serdeName + " initialized with: columnNames=" + + columnNames + " columnTypes=" + columnTypes + + " separator=" + Arrays.asList(separators) + + " nullstring=" + nullString + " lastColumnTakesRest=" + + lastColumnTakesRest + " timestampFormats=" + timestampFormats); } - + /** * Extracts and set column names and column types from the table properties * @throws SerDeException @@ -146,7 +165,7 @@ public void extractColumnInfo() throws SerDeException { + " elements while columns.types has " + columnTypes.size() + " elements!"); } } - + public List getColumnTypes() { return columnTypes; } @@ -155,14 +174,10 @@ public void extractColumnInfo() throws SerDeException { return columnNames; } - public byte[] getSeparators() { + public byte[] getSeparators() { return separators; } - public String getNullString() { - return nullString; - } - public Text getNullSequence() { return nullSequence; } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java index c373047..cb3f9d1 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java @@ -73,6 +73,7 @@ serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, serdeConstants.FIELD_DELIM, serdeConstants.COLLECTION_DELIM, serdeConstants.MAPKEY_DELIM, serdeConstants.SERIALIZATION_FORMAT, serdeConstants.SERIALIZATION_NULL_FORMAT, + serdeConstants.SERIALIZATION_ESCAPE_CRLF, serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST, serdeConstants.ESCAPE_CHAR, serdeConstants.SERIALIZATION_ENCODING, @@ -132,12 +133,6 @@ public void initialize(Configuration job, Properties tbl) cachedLazyStruct = (LazyStruct) LazyFactory .createLazyObject(cachedObjectInspector); - LOG.debug(getClass().getName() + " initialized with: columnNames=" - + serdeParams.getColumnNames() + " columnTypes=" + serdeParams.getColumnTypes() - + " separator=" + Arrays.asList(serdeParams.getSeparators()) - + " nullstring=" + serdeParams.getNullString() + " lastColumnTakesRest=" - + serdeParams.isLastColumnTakesRest() + " timestampFormats=" + serdeParams.getTimestampFormats()); - serializedSize = 0; stats = new SerDeStats(); lastOperationSerialize = false; diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java index a5e4be4..d6b2219 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java @@ -181,10 +181,19 @@ public static void writeEscaped(OutputStream out, byte[] bytes, int start, if (i > start) { out.write(bytes, start, i - start); } - start = i; - if (i < len) { - out.write(escapeChar); + + if (i == end) break; + + out.write(escapeChar); + if (bytes[i] == '\r') { + out.write('r'); + start = i + 1; + } else if (bytes[i] == '\n') { + out.write('n'); + start = i + 1; + } else { // the current char will be written out later. + start = i; } } } @@ -443,12 +452,19 @@ public static void copyAndEscapeStringDataToText(byte[] inputBytes, int start, i byte[] outputBytes = data.getBytes(); for (int i = 0; i < length; i++) { byte b = inputBytes[start + i]; - if (b != escapeChar || i == length - 1) { - outputBytes[k++] = b; + if (b == escapeChar && i < length - 1) { + ++i; + // Check if it's '\r' or '\n' + if (inputBytes[start + i] == 'r') { + outputBytes[k++] = '\r'; + } else if (inputBytes[start + i] == 'n') { + outputBytes[k++] = '\n'; + } else { + // get the next byte + outputBytes[k++] = inputBytes[start + i]; + } } else { - // get the next byte - i++; - outputBytes[k++] = inputBytes[start + i]; + outputBytes[k++] = b; } } assert (k == outputLength);