diff --git data/files/t1_multi_delimit.csv data/files/t1_multi_delimit.csv
new file mode 100644
index 0000000000..6c4e729f42
--- /dev/null
+++ data/files/t1_multi_delimit.csv
@@ -0,0 +1,10 @@
+1^,1^,^,0^,0
+2^,1^,^,0^,1
+3^,1^,^,0^,0
+4^,1^,^,0^,1
+5^,5
+
+7777
+8^,8^,^,8^,8^,8
+9^,9^,^,9^,9^,9^,9^,^,9^,9^,9
+10101010
\ No newline at end of file
diff --git data/files/t2_multi_delimit.csv data/files/t2_multi_delimit.csv
new file mode 100644
index 0000000000..0dd42e1dfb
--- /dev/null
+++ data/files/t2_multi_delimit.csv
@@ -0,0 +1,4 @@
+1^,1^,^,0^,0^,0
+2^,1^,^,0^,1^,0
+3^,1^,^,0^,0^,0
+4^,1^,^,0^,1^,0
diff --git data/files/t3_multi_delimit.csv data/files/t3_multi_delimit.csv
new file mode 100644
index 0000000000..8c49f6f383
--- /dev/null
+++ data/files/t3_multi_delimit.csv
@@ -0,0 +1,10 @@
+1^^^^^1^^^^^^^^^^0^^^^^0
+2^^^^^1^^^^^^^^^^0^^^^^1
+3^^^^^1^^^^^^^^^^0^^^^^0
+4^^^^^1^^^^^^^^^^0^^^^^1
+5^^^^^5
+
+7777
+8^^^^^8^^^^^^^^^^8^^^^^8^^^^^8
+9^^^^^9^^^^^^^^^^9^^^^^9^^^^^9
+10101010
\ No newline at end of file
diff --git ql/src/test/queries/clientpositive/serde_multi_delimit.q ql/src/test/queries/clientpositive/serde_multi_delimit.q
new file mode 100644
index 0000000000..5ea1b711c6
--- /dev/null
+++ ql/src/test/queries/clientpositive/serde_multi_delimit.q
@@ -0,0 +1,43 @@
+-- in this table, rows of different lengths (i.e. different numbers of columns) are loaded
+CREATE TABLE t1_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit;
+
+SELECT * FROM t1_multi_delimit;
+
+-- into this table, a file having an extra column is loaded
+CREATE TABLE t2_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit;
+
+SELECT * FROM t2_multi_delimit;
+
+-- in this table, a delimiter of 5 characters is used
+CREATE TABLE t3_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit;
+
+SELECT * FROM t3_multi_delimit;
+
+
+DROP TABLE t1_multi_delimit;
+DROP TABLE t2_multi_delimit;
+DROP TABLE t3_multi_delimit;
\ No newline at end of file
diff --git ql/src/test/results/clientpositive/serde_multi_delimit.q.out ql/src/test/results/clientpositive/serde_multi_delimit.q.out
new file mode 100644
index 0000000000..c9f671ceca
--- /dev/null
+++ ql/src/test/results/clientpositive/serde_multi_delimit.q.out
@@ -0,0 +1,156 @@
+PREHOOK: query: CREATE TABLE t1_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: CREATE TABLE t1_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: SELECT * FROM t1_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t1_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+1	1	NULL	0	0
+2	1	NULL	0	1
+3	1	NULL	0	0
+4	1	NULL	0	1
+5	5	NULL	NULL	NULL
+NULL	NULL	NULL	NULL	NULL
+7777	NULL	NULL	NULL	NULL
+8	8	NULL	8	8
+9	9	NULL	9	9
+10101010	NULL	NULL	NULL	NULL
+PREHOOK: query: CREATE TABLE t2_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: CREATE TABLE t2_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: SELECT * FROM t2_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t2_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2_multi_delimit
+#### A masked pattern was here ####
+1	1	NULL	0	0
+2	1	NULL	0	1
+3	1	NULL	0	0
+4	1	NULL	0	1
+PREHOOK: query: CREATE TABLE t3_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: CREATE TABLE t3_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t3_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t3_multi_delimit
+PREHOOK: query: SELECT * FROM t3_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t3_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t3_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t3_multi_delimit
+#### A masked pattern was here ####
+1	1	NULL	0	0
+2	1	NULL	0	1
+3	1	NULL	0	0
+4	1	NULL	0	1
+5	5	NULL	NULL	NULL
+NULL	NULL	NULL	NULL	NULL
+7777	NULL	NULL	NULL	NULL
+8	8	NULL	8	8
+9	9	NULL	9	9
+10101010	NULL	NULL	NULL	NULL
+PREHOOK: query: DROP TABLE t1_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1_multi_delimit
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: DROP TABLE t1_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1_multi_delimit
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: DROP TABLE t2_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t2_multi_delimit
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: DROP TABLE t2_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t2_multi_delimit
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: DROP TABLE t3_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t3_multi_delimit
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: DROP TABLE t3_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t3_multi_delimit
+POSTHOOK: Output: default@t3_multi_delimit
diff --git serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
index d7d0d87bf0..fc0d47064b 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
@@ -90,6 +90,8 @@
   private final ByteStream.Output serializeStream = new ByteStream.Output();
   // The Writable to return in serialize
   private final Text serializeCache = new Text();
+  // precompiled literal pattern for the multi-char field delimiter
+  private Pattern delimiterPattern;
 
   @Override
   public void initialize(Configuration conf, Properties tbl) throws SerDeException {
@@ -101,7 +103,7 @@ public void initialize(Configuration conf, Properties tbl) throws SerDeException
     if (fieldDelimited == null || fieldDelimited.isEmpty()) {
       throw new SerDeException("This table does not have serde property \"field.delim\"!");
     }
-
+    delimiterPattern = Pattern.compile(fieldDelimited, Pattern.LITERAL);
     // get the collection separator and map key separator
     // TODO: use serdeConstants.COLLECTION_DELIM when the typo is fixed
     collSep = LazyUtils.getByte(tbl.getProperty(COLLECTION_DELIM),
@@ -157,7 +159,7 @@ public Object doDeserialize(Writable blob) throws SerDeException
     byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), "\1").getBytes());
     cachedLazyStruct.init(byteArrayRef, 0, byteArrayRef.getData().length);
     // use the multi-char delimiter to parse the lazy struct
-    cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(), fieldDelimited.getBytes());
+    cachedLazyStruct.parseMultiDelimit(rowStr, delimiterPattern);
     return cachedLazyStruct;
   }
 
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
index f066aaa3bf..7c70fc6688 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
@@ -20,8 +20,9 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
-import com.google.common.primitives.Bytes;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -279,7 +280,7 @@ public long getRawDataSerializedSize() {
   }
 
   // parse the struct using multi-char delimiter
-  public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
+  public void parseMultiDelimit(final String rawRow, final Pattern fieldDelimit) {
     if (rawRow == null || fieldDelimit == null) {
       return;
     }
@@ -292,45 +293,55 @@ public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
       fieldInited = new boolean[fields.length];
       startPosition = new int[fields.length + 1];
     }
+    final int delimiterLength = fieldDelimit.toString().length();
     // the indexes of the delimiters
-    int[] delimitIndexes = findIndexes(rawRow, fieldDelimit);
-    int diff = fieldDelimit.length - 1;
+    List<Integer> delimitIndices = findIndexes(rawRow, fieldDelimit);
+    int diff = delimiterLength - 1;
+    // the first field always starts at 0, even when missing
     startPosition[0] = 0;
     for (int i = 1; i < fields.length; i++) {
-      if (delimitIndexes[i - 1] != -1) {
-        int start = delimitIndexes[i - 1] + fieldDelimit.length;
+      if (delimitIndices.get(i - 1) != -1) {
+        int start = delimitIndices.get(i - 1) + delimiterLength;
         startPosition[i] = start - i * diff;
       } else {
         startPosition[i] = length + 1;
       }
     }
-    startPosition[fields.length] = length + 1;
+
+    // calculate the length of the complete record containing fields.length fields
+    final int totalRecordLength;
+    final int fieldLength = fields.length;
+    // this means we have more delimiters (and hence columns) than required; ideally n fields have n-1 delimiters
+    if (delimitIndices.size() >= fieldLength) {
+      // MultiDelimitSerDe replaces the actual multi-char delimiter with "\1", which reduces the length;
+      // however, here we get rawRow with the original multi-char delimiter,
+      // so we must subtract those extra chars to match the length of LazyNonPrimitive#bytes, which is used
+      // while reading data; see uncheckedGetField()
+      totalRecordLength = (delimitIndices.get(fieldLength - 1) + delimiterLength) - fieldLength * diff;
+      LOG.warn("More delimiters[{}] found than expected[{}]. Ignoring bytes after extra delimiters", delimitIndices.size(),
+          fieldLength - 1);
+    } else {
+      totalRecordLength = length + 1;
+    }
+
+    startPosition[fields.length] = totalRecordLength;
     Arrays.fill(fieldInited, false);
     parsed = true;
   }
 
-  // find all the indexes of the sub byte[]
-  private int[] findIndexes(byte[] array, byte[] target) {
-    if (fields.length <= 1) {
-      return new int[0];
-    }
-    int[] indexes = new int[fields.length - 1];
-    Arrays.fill(indexes, -1);
-    indexes[0] = Bytes.indexOf(array, target);
-    if (indexes[0] == -1) {
-      return indexes;
+  // find the indexes of all delimiter occurrences
+  private List<Integer> findIndexes(final String text, final Pattern pattern) {
+    List<Integer> delimiterIndices = new ArrayList<>();
+    Matcher matcher = pattern.matcher(text);
+    while (matcher.find()) {
+      delimiterIndices.add(matcher.start());
     }
-    int indexInNewArray = indexes[0];
-    for (int i = 1; i < indexes.length; i++) {
-      array = Arrays.copyOfRange(array, indexInNewArray + target.length, array.length);
-      indexInNewArray = Bytes.indexOf(array, target);
-      if (indexInNewArray == -1) {
-        break;
-      }
-      indexes[i] = indexInNewArray + indexes[i - 1] + target.length;
+    // pad with -1 for missing delimiters: ideally n fields should have (n - 1) delimiters
+    for (int i = delimiterIndices.size(); i < fields.length - 1; i++) {
+      delimiterIndices.add(-1);
    }
-    return indexes;
+    return delimiterIndices;
  }
 
  /**
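
Note on the index arithmetic (illustration, not part of the patch): parseMultiDelimit receives the raw row with the original multi-character delimiter, while the bytes LazyStruct actually reads have each delimiter already replaced by the single byte "\1" in MultiDelimitSerDe#doDeserialize. Each delimiter therefore shifts every later field start left by (delimiterLength - 1), which is what the "start - i * diff" arithmetic compensates for. The following self-contained sketch (plain Java; the class name MultiDelimitSketch and variable names are mine, only java.util and java.util.regex are used) reproduces the computation for the first row of t1_multi_delimit.csv:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MultiDelimitSketch {
  public static void main(String[] args) {
    final String row = "1^,1^,^,0^,0";   // first row of t1_multi_delimit.csv
    final String delim = "^,";
    final int numFields = 5;             // colA..colE
    final Pattern p = Pattern.compile(delim, Pattern.LITERAL);
    final int delimLen = delim.length();

    // findIndexes: positions of every delimiter match, padded with -1
    List<Integer> idx = new ArrayList<>();
    Matcher m = p.matcher(row);
    while (m.find()) {
      idx.add(m.start());
    }
    for (int i = idx.size(); i < numFields - 1; i++) {
      idx.add(-1);
    }
    // idx == [1, 4, 6, 9]

    // field start positions in the "\1"-substituted bytes: 1,\1,1,\1,\1,0,\1,0
    int diff = delimLen - 1;
    int[] start = new int[numFields + 1];
    start[0] = 0;
    for (int i = 1; i < numFields; i++) {
      start[i] = idx.get(i - 1) != -1
          ? idx.get(i - 1) + delimLen - i * diff  // shift left by i * (delimLen - 1)
          : row.length() + 1;                     // missing field: point past the end
    }
    int substitutedLength = row.replaceAll(Pattern.quote(delim), "\1").length();
    if (idx.size() >= numFields) {
      // extra delimiters: cap the record at the (numFields-1)-th delimiter, as in the patch
      start[numFields] = (idx.get(numFields - 1) + delimLen) - numFields * diff;
    } else {
      start[numFields] = substitutedLength + 1;
    }
    System.out.println(Arrays.toString(start));   // [0, 2, 4, 5, 7, 9]
  }
}

In this sketch's model, field i is read as the substituted bytes from start[i] with length start[i+1] - start[i] - 1, so the empty third field (colC) gets length 5 - 4 - 1 = 0 and deserializes to NULL, matching the t1_multi_delimit output above.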