diff --git ql/src/test/queries/clientpositive/escape_crlf.q ql/src/test/queries/clientpositive/escape_crlf.q index 46c3605..c7ef595 100644 --- ql/src/test/queries/clientpositive/escape_crlf.q +++ ql/src/test/queries/clientpositive/escape_crlf.q @@ -1,4 +1,3 @@ --- Suppress vectorization due to known bug. See HIVE-19118. set hive.vectorized.execution.enabled=false; set hive.test.vectorized.execution.enabled.override=disable; diff --git ql/src/test/queries/clientpositive/vectorization_escape_crlf.q ql/src/test/queries/clientpositive/vectorization_escape_crlf.q new file mode 100644 index 0000000..e8ae720 --- /dev/null +++ ql/src/test/queries/clientpositive/vectorization_escape_crlf.q @@ -0,0 +1,22 @@ +set hive.vectorized.execution.enabled=true; +set hive.test.vectorized.execution.enabled.override=enable; + +DROP TABLE IF EXISTS base_tab; +CREATE TABLE base_tab(a STRING, b STRING) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY '|'; +DESCRIBE EXTENDED base_tab; + +LOAD DATA LOCAL INPATH '../../data/files/escape_crlf.txt' OVERWRITE INTO TABLE base_tab; +-- No crlf escaping +SELECT * FROM base_tab; + +-- Crlf escaping +ALTER TABLE base_tab SET SERDEPROPERTIES ('escape.delim'='\\', 'serialization.escape.crlf'='true'); +SELECT * FROM base_tab; + +SET hive.fetch.task.conversion=none; +-- Make sure intermediate serde works correctly +SELECT * FROM base_tab; + +DROP TABLE base_tab; diff --git ql/src/test/results/clientpositive/vectorization_escape_crlf.q.out ql/src/test/results/clientpositive/vectorization_escape_crlf.q.out new file mode 100644 index 0000000..8b5df8c --- /dev/null +++ ql/src/test/results/clientpositive/vectorization_escape_crlf.q.out @@ -0,0 +1,92 @@ +PREHOOK: query: DROP TABLE IF EXISTS base_tab +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS base_tab +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE base_tab(a STRING, b STRING) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY '|' +PREHOOK: type: CREATETABLE +PREHOOK: Output: 
database:default +PREHOOK: Output: default@base_tab +POSTHOOK: query: CREATE TABLE base_tab(a STRING, b STRING) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY '|' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@base_tab +PREHOOK: query: DESCRIBE EXTENDED base_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@base_tab +POSTHOOK: query: DESCRIBE EXTENDED base_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@base_tab +a string +b string + +#### A masked pattern was here #### +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/escape_crlf.txt' OVERWRITE INTO TABLE base_tab +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@base_tab +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/escape_crlf.txt' OVERWRITE INTO TABLE base_tab +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@base_tab +PREHOOK: query: SELECT * FROM base_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@base_tab +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM base_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@base_tab +#### A masked pattern was here #### +This\nis\rthe first\r\nmulti-line field field1-2 +This\nis\rthe second\r\nmulti-line field field2-2 +PREHOOK: query: ALTER TABLE base_tab SET SERDEPROPERTIES ('escape.delim'='\\', 'serialization.escape.crlf'='true') +PREHOOK: type: ALTERTABLE_SERDEPROPERTIES +PREHOOK: Input: default@base_tab +PREHOOK: Output: default@base_tab +POSTHOOK: query: ALTER TABLE base_tab SET SERDEPROPERTIES ('escape.delim'='\\', 'serialization.escape.crlf'='true') +POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES +POSTHOOK: Input: default@base_tab +POSTHOOK: Output: default@base_tab +PREHOOK: query: SELECT * FROM base_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@base_tab +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM base_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@base_tab +#### 
A masked pattern was here #### +This +is +the first +multi-line field field1-2 +This +is +the second +multi-line field field2-2 +PREHOOK: query: SELECT * FROM base_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@base_tab +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM base_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@base_tab +#### A masked pattern was here #### +This +is +the first +multi-line field field1-2 +This +is +the second +multi-line field field2-2 +PREHOOK: query: DROP TABLE base_tab +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@base_tab +PREHOOK: Output: default@base_tab +POSTHOOK: query: DROP TABLE base_tab +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@base_tab +POSTHOOK: Output: default@base_tab diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java index dd88da8..1890f18 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java @@ -917,10 +917,11 @@ public void copyToExternalBuffer(byte[] externalBuffer, int externalBufferStart) private void copyToBuffer(byte[] buffer, int bufferStart, int bufferLength) { final int fieldStart = currentFieldStart; + final int fieldLength = currentFieldLength; int k = 0; - for (int i = 0; i < bufferLength; i++) { + for (int i = 0; i < fieldLength; i++) { byte b = bytes[fieldStart + i]; - if (b == escapeChar && i < bufferLength - 1) { + if (b == escapeChar && i < fieldLength - 1) { ++i; // Check if it's '\r' or '\n' if (bytes[fieldStart + i] == 'r') { diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazy/fast/TestLazySimpleDeserializeRead.java serde/src/test/org/apache/hadoop/hive/serde2/lazy/fast/TestLazySimpleDeserializeRead.java new file mode 100644 index 0000000..a7873f2 --- /dev/null +++ 
serde/src/test/org/apache/hadoop/hive/serde2/lazy/fast/TestLazySimpleDeserializeRead.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.lazy.fast;
+
+import junit.framework.TestCase;
+
+import java.util.Properties;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
+import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Unit tests for {@link LazySimpleDeserializeRead}.
+ */
+public class TestLazySimpleDeserializeRead extends TestCase {
+
+  /**
+   * Verifies that a string field containing escaped CR/LF sequences is
+   * unescaped correctly when it is copied into an external buffer.
+   *
+   * @throws Exception if the SerDe parameters cannot be built or the row cannot be read
+   */
+  public void testEscaping() throws Exception {
+    HiveConf hconf = new HiveConf();
+
+    // set the escaping related properties
+    Properties props = new Properties();
+    props.setProperty(serdeConstants.FIELD_DELIM, "|");
+    props.setProperty(serdeConstants.ESCAPE_CHAR, "\\");
+    props.setProperty(serdeConstants.SERIALIZATION_ESCAPE_CRLF, "true");
+
+    LazySerDeParameters lazyParams =
+        new LazySerDeParameters(hconf, props,
+        LazySimpleSerDe.class.getName());
+
+    TypeInfo[] typeInfos = new TypeInfo[2];
+    typeInfos[0] = TypeInfoFactory.getPrimitiveTypeInfo("string");
+    typeInfos[1] = TypeInfoFactory.getPrimitiveTypeInfo("string");
+
+    LazySimpleDeserializeRead deserializeRead =
+        new LazySimpleDeserializeRead(typeInfos, null, true, lazyParams);
+
+    // set and parse the row; the first field ends with an escaped newline
+    String s = "This\\nis\\rthe first\\r\\nmulti-line field\\n|field1-2";
+    Text row = new Text(s.getBytes("UTF-8"));
+    deserializeRead.set(row.getBytes(), 0, row.getLength());
+
+    assertTrue(deserializeRead.readNextField());
+    // the field contains escape sequences, so the test expects an external buffer to be required
+    assertTrue(deserializeRead.currentExternalBufferNeeded);
+
+    // 36 is the length in bytes of the unescaped first field (see f below)
+    int externalBufferLen = deserializeRead.currentExternalBufferNeededLen;
+    assertEquals("Wrong external buffer length", 36, externalBufferLen);
+
+    byte[] externalBuffer = new byte[externalBufferLen];
+    deserializeRead.copyToExternalBuffer(externalBuffer, 0);
+
+    Text field = new Text();
+    field.set(externalBuffer, 0, externalBufferLen);
+
+    String f = "This\nis\rthe first\r\nmulti-line field\n";
+    Text escaped = new Text(f.getBytes("UTF-8"));
+
+    assertEquals("The escaped result is incorrect", escaped, field);
+  }
+}