Escape handling for CHAR and VARCHAR columns in the serde2.lazy classes.

Summary of the changes in this patch:
  1. LazyHiveChar.init and LazyHiveVarchar.init un-escape the field bytes when
     the object inspector reports isEscaped(); otherwise they decode the bytes
     with Text.decode and turn a CharacterCodingException into NULL.
  2. The un-escape loop is removed from LazyString.init and added as the shared
     helper LazyUtils.copyAndEscapeStringDataToText, which LazyString now calls.
  3. LazyHiveCharObjectInspector and LazyHiveVarcharObjectInspector gain
     escaped/escapeChar fields, a three-argument constructor and the getters
     isEscaped()/getEscapeChar(); the one-argument constructor delegates with
     escaped=false and escapeChar=0.
  4. LazyPrimitiveObjectInspectorFactory passes escaped/escapeChar to the CHAR
     and VARCHAR inspectors.
  5. New test escape3.q, with data_with_escape.txt and escape3.q.out, loads
     '|'-delimited rows containing '\|' into STRING, VARCHAR and CHAR tables
     declared with ESCAPED BY '\134'.

diff --git data/files/data_with_escape.txt data/files/data_with_escape.txt
new file mode 100644
index 0000000..bd9cc6e
--- /dev/null
+++ data/files/data_with_escape.txt
@@ -0,0 +1,5 @@
+re\|ading|V\|A|100
+writ\|ing|MD|200
+w\|aiting|\|NC|300
+seein\|g|TN\||400
+runn\|ing|WV|500
diff --git ql/src/test/queries/clientpositive/escape3.q ql/src/test/queries/clientpositive/escape3.q
new file mode 100644
index 0000000..192ee84
--- /dev/null
+++ ql/src/test/queries/clientpositive/escape3.q
@@ -0,0 +1,48 @@
+-- with string
+CREATE TABLE escape3_1
+(
+GERUND STRING,
+ABBREV STRING,
+CODE SMALLINT
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|' ESCAPED BY '\134'
+STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../../data/files/data_with_escape.txt' INTO TABLE escape3_1;
+
+select * from escape3_1;
+
+-- with varchar
+CREATE TABLE escape3_2
+(
+GERUND VARCHAR(10),
+ABBREV VARCHAR(3),
+CODE SMALLINT
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|' ESCAPED BY '\134'
+STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../../data/files/data_with_escape.txt' INTO TABLE escape3_2;
+
+select * from escape3_2;
+
+-- with char
+CREATE TABLE escape3_3
+(
+GERUND CHAR(10),
+ABBREV CHAR(3),
+CODE SMALLINT
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|' ESCAPED BY '\134'
+STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../../data/files/data_with_escape.txt' INTO TABLE escape3_3;
+
+select * from escape3_3;
+
+DROP TABLE escape3_1;
+DROP TABLE escape3_2;
+DROP TABLE escape3_3;
diff --git ql/src/test/results/clientpositive/escape3.q.out ql/src/test/results/clientpositive/escape3.q.out
new file mode 100644
index 0000000..dc53583
--- /dev/null
+++ ql/src/test/results/clientpositive/escape3.q.out
@@ -0,0 +1,165 @@
+PREHOOK: query: -- with string
+CREATE TABLE escape3_1
+(
+GERUND STRING,
+ABBREV STRING,
+CODE SMALLINT
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|' ESCAPED BY '\134'
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@escape3_1
+POSTHOOK: query: -- with string
+CREATE TABLE escape3_1
+(
+GERUND STRING,
+ABBREV STRING,
+CODE SMALLINT
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|' ESCAPED BY '\134'
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@escape3_1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/data_with_escape.txt' INTO TABLE escape3_1
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@escape3_1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/data_with_escape.txt' INTO TABLE escape3_1
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@escape3_1
+PREHOOK: query: select * from escape3_1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@escape3_1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from escape3_1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@escape3_1
+#### A masked pattern was here ####
+re|ading V|A 100
+writ|ing MD 200
+w|aiting |NC 300
+seein|g TN| 400
+runn|ing WV 500
+PREHOOK: query: -- with varchar
+CREATE TABLE escape3_2
+(
+GERUND VARCHAR(10),
+ABBREV VARCHAR(3),
+CODE SMALLINT
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|' ESCAPED BY '\134'
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@escape3_2
+POSTHOOK: query: -- with varchar
+CREATE TABLE escape3_2
+(
+GERUND VARCHAR(10),
+ABBREV VARCHAR(3),
+CODE SMALLINT
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|' ESCAPED BY '\134'
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@escape3_2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/data_with_escape.txt' INTO TABLE escape3_2
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@escape3_2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/data_with_escape.txt' INTO TABLE escape3_2
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@escape3_2
+PREHOOK: query: select * from escape3_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@escape3_2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from escape3_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@escape3_2
+#### A masked pattern was here ####
+re|ading V|A 100
+writ|ing MD 200
+w|aiting |NC 300
+seein|g TN| 400
+runn|ing WV 500
+PREHOOK: query: -- with char
+CREATE TABLE escape3_3
+(
+GERUND CHAR(10),
+ABBREV CHAR(3),
+CODE SMALLINT
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|' ESCAPED BY '\134'
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@escape3_3
+POSTHOOK: query: -- with char
+CREATE TABLE escape3_3
+(
+GERUND CHAR(10),
+ABBREV CHAR(3),
+CODE SMALLINT
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|' ESCAPED BY '\134'
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@escape3_3
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/data_with_escape.txt' INTO TABLE escape3_3
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@escape3_3
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/data_with_escape.txt' INTO TABLE escape3_3
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@escape3_3
+PREHOOK: query: select * from escape3_3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@escape3_3
+#### A masked pattern was here ####
+POSTHOOK: query: select * from escape3_3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@escape3_3
+#### A masked pattern was here ####
+re|ading V|A 100
+writ|ing MD 200
+w|aiting |NC 300
+seein|g TN| 400
+runn|ing WV 500
+PREHOOK: query: DROP TABLE escape3_1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@escape3_1
+PREHOOK: Output: default@escape3_1
+POSTHOOK: query: DROP TABLE escape3_1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@escape3_1
+POSTHOOK: Output: default@escape3_1
+PREHOOK: query: DROP TABLE escape3_2
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@escape3_2
+PREHOOK: Output: default@escape3_2
+POSTHOOK: query: DROP TABLE escape3_2
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@escape3_2
+POSTHOOK: Output: default@escape3_2
+PREHOOK: query: DROP TABLE escape3_3
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@escape3_3
+PREHOOK: Output: default@escape3_3
+POSTHOOK: query: DROP TABLE escape3_3
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@escape3_3
+POSTHOOK: Output: default@escape3_3
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveChar.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveChar.java
index ef469eb..3799c7c 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveChar.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveChar.java
@@ -55,14 +55,24 @@ public void setValue(LazyHiveChar copy) {
 
   @Override
   public void init(ByteArrayRef bytes, int start, int length) {
-    String byteData = null;
-    try {
-      byteData = Text.decode(bytes.getData(), start, length);
-      data.set(byteData, maxLength);
+    if (oi.isEscaped()) {
+      Text textData = data.getTextValue();
+      // Un-escape into the Text from data.getTextValue(), then set it back as a String so that
+      // maxLength is applied. Enforcing length while un-escaping would avoid the extra copies.
+      LazyUtils.copyAndEscapeStringDataToText(bytes.getData(), start, length,
+          oi.getEscapeChar(),textData);
+      data.set(textData.toString(), maxLength);
       isNull = false;
-    } catch (CharacterCodingException e) {
-      isNull = true;
-      LOG.debug("Data not in the HiveChar data type range so converted to null.", e);
+    } else {
+      String byteData = null;
+      try {
+        byteData = Text.decode(bytes.getData(), start, length);
+        data.set(byteData, maxLength);
+        isNull = false;
+      } catch (CharacterCodingException e) {
+        isNull = true;
+        LOG.debug("Data not in the HiveChar data type range so converted to null.", e);
+      }
     }
   }
 
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveVarchar.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveVarchar.java
index bc8d41e..b5e0920 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveVarchar.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyHiveVarchar.java
@@ -55,15 +55,25 @@ public void setValue(LazyHiveVarchar copy) {
 
   @Override
   public void init(ByteArrayRef bytes, int start, int length) {
-    String byteData = null;
-    try {
-      byteData = Text.decode(bytes.getData(), start, length);
-      data.set(byteData, maxLength);
-      isNull = false;
-    } catch (CharacterCodingException e) {
-      isNull = true;
-      LOG.debug("Data not in the HiveVarchar data type range so converted to null.", e);
-    }
+    if (oi.isEscaped()) {
+      Text textData = data.getTextValue();
+      // Un-escape into the Text from data.getTextValue(), then set it back as a String so that
+      // maxLength is applied. Enforcing length while un-escaping would avoid the extra copies.
+      LazyUtils.copyAndEscapeStringDataToText(bytes.getData(), start, length,
+          oi.getEscapeChar(),textData);
+      data.set(textData.toString(), maxLength);
+      isNull = false;
+    } else {
+      try {
+        String byteData = null;
+        byteData = Text.decode(bytes.getData(), start, length);
+        data.set(byteData, maxLength);
+        isNull = false;
+      } catch (CharacterCodingException e) {
+        isNull = true;
+        LOG.debug("Data not in the HiveVarchar data type range so converted to null.", e);
+      }
+    }
   }
 
 }
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java
index 28b3f86..75b9556 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java
@@ -40,40 +40,7 @@ public void init(ByteArrayRef bytes, int start, int length) {
     if (oi.isEscaped()) {
       byte escapeChar = oi.getEscapeChar();
       byte[] inputBytes = bytes.getData();
-
-      // First calculate the length of the output string
-      int outputLength = 0;
-      for (int i = 0; i < length; i++) {
-        if (inputBytes[start + i] != escapeChar) {
-          outputLength++;
-        } else {
-          outputLength++;
-          i++;
-        }
-      }
-
-      // Copy the data over, so that the internal state of Text will be set to
-      // the required outputLength.
-      data.set(bytes.getData(), start, outputLength);
-
-      // We need to copy the data byte by byte only in case the
-      // "outputLength < length" (which means there is at least one escaped
-      // byte.
-      if (outputLength < length) {
-        int k = 0;
-        byte[] outputBytes = data.getBytes();
-        for (int i = 0; i < length; i++) {
-          byte b = inputBytes[start + i];
-          if (b != escapeChar || i == length - 1) {
-            outputBytes[k++] = b;
-          } else {
-            // get the next byte
-            i++;
-            outputBytes[k++] = inputBytes[start + i];
-          }
-        }
-        assert (k == outputLength);
-      }
+      LazyUtils.copyAndEscapeStringDataToText(inputBytes, start, length, escapeChar, data);
     } else {
       // if the data is not escaped, simply copy the data.
       data.set(bytes.getData(), start, length);
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
index 1d62422..3943508 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
@@ -412,6 +412,44 @@ static byte getSeparator(byte[] separators, int level) throws SerDeException {
     }
   }
 
+  public static void copyAndEscapeStringDataToText(byte[] inputBytes, int start, int length,
+      byte escapeChar, Text data) {
+
+    // Pass 1: compute the output length; an escape byte and the byte after it count as one.
+    int outputLength = 0;
+    for (int i = 0; i < length; i++) {
+      if (inputBytes[start + i] != escapeChar) {
+        outputLength++;
+      } else {
+        outputLength++;
+        i++;
+      }
+    }
+
+    // Copy the first outputLength input bytes so that the Text already has its final length;
+    // if nothing was escaped, this copy is already the complete result.
+    data.set(inputBytes, start, outputLength);
+
+    // Pass 2: a byte-by-byte copy is needed only when "outputLength < length", i.e. when at
+    // least one escape byte has to be dropped. An escape byte in the last position has
+    // nothing after it and is copied through unchanged.
+    if (outputLength < length) {
+      int k = 0;
+      byte[] outputBytes = data.getBytes();
+      for (int i = 0; i < length; i++) {
+        byte b = inputBytes[start + i];
+        if (b != escapeChar || i == length - 1) {
+          outputBytes[k++] = b;
+        } else {
+          // skip the escape byte and copy the byte that follows it
+          i++;
+          outputBytes[k++] = inputBytes[start + i];
+        }
+      }
+      assert (k == outputLength);
+    }
+  }
+
   private LazyUtils() {
     // prevent instantiation
   }
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveCharObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveCharObjectInspector.java
index 65fb1ab..2b0ad15 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveCharObjectInspector.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveCharObjectInspector.java
@@ -29,12 +29,21 @@
     extends AbstractPrimitiveLazyObjectInspector
     implements HiveCharObjectInspector {
 
+  private boolean escaped;
+  private byte escapeChar;
+
   // no-arg ctor required for Kyro
   public LazyHiveCharObjectInspector() {
   }
 
   public LazyHiveCharObjectInspector(CharTypeInfo typeInfo) {
+    this(typeInfo, false, (byte)0);
+  }
+
+  public LazyHiveCharObjectInspector(CharTypeInfo typeInfo, boolean escaped, byte escapeChar) {
     super(typeInfo);
+    this.escaped = escaped;
+    this.escapeChar = escapeChar;
   }
 
   @Override
@@ -63,6 +72,14 @@ public HiveChar getPrimitiveJavaObject(Object o) {
     return ret;
   }
 
+  public boolean isEscaped() {
+    return escaped;
+  }
+
+  public byte getEscapeChar() {
+    return escapeChar;
+  }
+
   @Override
   public String toString() {
     return getTypeName();
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveVarcharObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveVarcharObjectInspector.java
index c802ed0..8e7acce 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveVarcharObjectInspector.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyHiveVarcharObjectInspector.java
@@ -29,12 +29,21 @@
     extends AbstractPrimitiveLazyObjectInspector
     implements HiveVarcharObjectInspector {
 
+  private boolean escaped;
+  private byte escapeChar;
+
   // no-arg ctor required for Kyro
   public LazyHiveVarcharObjectInspector() {
   }
 
   public LazyHiveVarcharObjectInspector(VarcharTypeInfo typeInfo) {
+    this(typeInfo, false, (byte)0);
+  }
+
+  public LazyHiveVarcharObjectInspector(VarcharTypeInfo typeInfo, boolean escaped, byte escapeChar) {
     super(typeInfo);
+    this.escaped = escaped;
+    this.escapeChar = escapeChar;
   }
 
   @Override
@@ -63,6 +72,14 @@ public HiveVarchar getPrimitiveJavaObject(Object o) {
     return ret;
   }
 
+  public boolean isEscaped() {
+    return escaped;
+  }
+
+  public byte getEscapeChar() {
+    return escapeChar;
+  }
+
   @Override
   public String toString() {
     return getTypeName();
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java
index 734b9d8..d376ee5 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java
@@ -121,6 +121,10 @@ private LazyPrimitiveObjectInspectorFactory() {
     switch(primitiveCategory) {
     case STRING:
       return getLazyStringObjectInspector(escaped, escapeChar);
+    case CHAR:
+      return new LazyHiveCharObjectInspector((CharTypeInfo)typeInfo, escaped, escapeChar);
+    case VARCHAR:
+      return new LazyHiveVarcharObjectInspector((VarcharTypeInfo)typeInfo, escaped, escapeChar);
     case BOOLEAN:
       return getLazyBooleanObjectInspector(extBoolean);
     default: