diff --git data/files/t11_csv_serde.csv data/files/t11_csv_serde.csv
new file mode 100644
index 0000000000..6e7060919e
--- /dev/null
+++ data/files/t11_csv_serde.csv
@@ -0,0 +1,10 @@
+1,1,,0,0
+2,1,,0,1
+3,1,,0,0
+4,1,,0,1
+5,5
+
+7777
+8,8,,8,8,8
+9,9,,9,9,9,9,,9,9,9
+10101010
\ No newline at end of file
diff --git data/files/t1_multi_delimit.csv data/files/t1_multi_delimit.csv
new file mode 100644
index 0000000000..6c4e729f42
--- /dev/null
+++ data/files/t1_multi_delimit.csv
@@ -0,0 +1,10 @@
+1^,1^,^,0^,0
+2^,1^,^,0^,1
+3^,1^,^,0^,0
+4^,1^,^,0^,1
+5^,5
+
+7777
+8^,8^,^,8^,8^,8
+9^,9^,^,9^,9^,9^,9^,^,9^,9^,9
+10101010
\ No newline at end of file
diff --git data/files/t2_multi_delimit.csv data/files/t2_multi_delimit.csv
new file mode 100644
index 0000000000..0dd42e1dfb
--- /dev/null
+++ data/files/t2_multi_delimit.csv
@@ -0,0 +1,4 @@
+1^,1^,^,0^,0^,0
+2^,1^,^,0^,1^,0
+3^,1^,^,0^,0^,0
+4^,1^,^,0^,1^,0
diff --git data/files/t3_multi_delimit.csv data/files/t3_multi_delimit.csv
new file mode 100644
index 0000000000..8c49f6f383
--- /dev/null
+++ data/files/t3_multi_delimit.csv
@@ -0,0 +1,10 @@
+1^^^^^1^^^^^^^^^^0^^^^^0
+2^^^^^1^^^^^^^^^^0^^^^^1
+3^^^^^1^^^^^^^^^^0^^^^^0
+4^^^^^1^^^^^^^^^^0^^^^^1
+5^^^^^5
+
+7777
+8^^^^^8^^^^^^^^^^8^^^^^8^^^^^8
+9^^^^^9^^^^^^^^^^9^^^^^9^^^^^9
+10101010
\ No newline at end of file
diff --git ql/src/test/queries/clientpositive/serde_multi_delimit.q ql/src/test/queries/clientpositive/serde_multi_delimit.q
new file mode 100644
index 0000000000..0d85175286
--- /dev/null
+++ ql/src/test/queries/clientpositive/serde_multi_delimit.q
@@ -0,0 +1,65 @@
+-- in this table, rows of different lengths (different numbers of columns) are loaded
+CREATE TABLE t1_multi_delimit(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit;
+
+SELECT * FROM t1_multi_delimit;
+
+-- in this table, rows of different lengths (different numbers of columns) are loaded, and it uses the CSV serde
+CREATE TABLE t11_csv_serde(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde;
+
+SELECT * FROM t11_csv_serde;
+
+-- there should not be any difference between the MultiDelimitSerDe table and the OpenCSVSerde table results
+
+SELECT EXISTS (
+SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit
+MINUS
+SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde
+);
+
+-- in this table, a file having an extra column is loaded
+CREATE TABLE t2_multi_delimit(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit;
+
+SELECT * FROM t2_multi_delimit;
+
+-- in this table, a delimiter of 5 characters is used
+CREATE TABLE t3_multi_delimit(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit;
+
+SELECT * FROM t3_multi_delimit;
+
+
+DROP TABLE t1_multi_delimit;
+DROP TABLE t11_csv_serde;
+DROP TABLE t2_multi_delimit;
+DROP TABLE t3_multi_delimit;
\ No newline at end of file
diff --git ql/src/test/results/clientpositive/serde_multi_delimit.q.out ql/src/test/results/clientpositive/serde_multi_delimit.q.out
new file mode 100644
index 0000000000..f13aa59d5a
--- /dev/null
+++ ql/src/test/results/clientpositive/serde_multi_delimit.q.out
@@ -0,0 +1,232 @@
+PREHOOK: query: CREATE TABLE t1_multi_delimit(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: CREATE TABLE t1_multi_delimit(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: SELECT * FROM t1_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t1_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+1	1	NULL	0	0
+2	1	NULL	0	1
+3	1	NULL	0	0
+4	1	NULL	0	1
+5	5	NULL	NULL	NULL
+NULL	NULL	NULL	NULL	NULL
+7777	NULL	NULL	NULL	NULL
+8	8	NULL	8	8
+9	9	NULL	9	9
+10101010	NULL	NULL	NULL	NULL
+PREHOOK: query: CREATE TABLE t11_csv_serde(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t11_csv_serde
+POSTHOOK: query: CREATE TABLE t11_csv_serde(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t11_csv_serde
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t11_csv_serde
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t11_csv_serde
+PREHOOK: query: SELECT * FROM t11_csv_serde
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t11_csv_serde
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t11_csv_serde
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t11_csv_serde
+#### A masked pattern was here ####
+1	1		0	0
+2	1		0	1
+3	1		0	0
+4	1		0	1
+5	5	NULL	NULL	NULL
+NULL	NULL	NULL	NULL	NULL
+7777	NULL	NULL	NULL	NULL
+8	8		8	8
+9	9		9	9
+10101010	NULL	NULL	NULL	NULL
+Warning: Shuffle Join JOIN[30][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product
+PREHOOK: query: SELECT EXISTS (
+SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit
+MINUS
+SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde
+)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Input: default@t11_csv_serde
+PREHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT EXISTS (
+SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit
+MINUS
+SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde
+)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Input: default@t11_csv_serde
+POSTHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+false
+PREHOOK: query: CREATE TABLE t2_multi_delimit(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: CREATE TABLE t2_multi_delimit(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: SELECT * FROM t2_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t2_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2_multi_delimit
+#### A masked pattern was here ####
+1	1	NULL	0	0
+2	1	NULL	0	1
+3	1	NULL	0	0
+4	1	NULL	0	1
+PREHOOK: query: CREATE TABLE t3_multi_delimit(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: CREATE TABLE t3_multi_delimit(colA int,
+ colB tinyint,
+ colC timestamp,
+ colD smallint,
+ colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t3_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t3_multi_delimit
+PREHOOK: query: SELECT * FROM t3_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t3_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t3_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t3_multi_delimit
+#### A masked pattern was here ####
+1	1	NULL	0	0
+2	1	NULL	0	1
+3	1	NULL	0	0
+4	1	NULL	0	1
+5	5	NULL	NULL	NULL
+NULL	NULL	NULL	NULL	NULL
+7777	NULL	NULL	NULL	NULL
+8	8	NULL	8	8
+9	9	NULL	9	9
+10101010	NULL	NULL	NULL	NULL
+PREHOOK: query: DROP TABLE t1_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1_multi_delimit
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: DROP TABLE t1_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1_multi_delimit
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: DROP TABLE t11_csv_serde
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t11_csv_serde
+PREHOOK: Output: default@t11_csv_serde
+POSTHOOK: query: DROP TABLE t11_csv_serde
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t11_csv_serde
+POSTHOOK: Output: default@t11_csv_serde
+PREHOOK: query: DROP TABLE t2_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t2_multi_delimit
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: DROP TABLE t2_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t2_multi_delimit
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: DROP TABLE t3_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t3_multi_delimit
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: DROP TABLE t3_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t3_multi_delimit
+POSTHOOK: Output: default@t3_multi_delimit
diff --git serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
index d7d0d87bf0..efe6597ffb 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
@@ -69,6 +69,9 @@
   // Due to HIVE-6404, define our own constant
   private static final String COLLECTION_DELIM = "collection.delim";
 
+  // actual delimiter (fieldDelimited) is replaced by REPLACEMENT_DELIM in row.
+  private static final String REPLACEMENT_DELIM = "\1";
+
   private int numColumns;
   private String fieldDelimited;
   // we don't support using multiple chars as delimiters within complex types
@@ -90,6 +93,8 @@
   private final ByteStream.Output serializeStream = new ByteStream.Output();
   // The Writable to return in serialize
   private final Text serializeCache = new Text();
+  // precompiled pattern for the field delimiter
+  private Pattern delimiterPattern;
 
   @Override
   public void initialize(Configuration conf, Properties tbl) throws SerDeException {
@@ -101,7 +106,7 @@ public void initialize(Configuration conf, Properties tbl) throws SerDeException
     if (fieldDelimited == null || fieldDelimited.isEmpty()) {
       throw new SerDeException("This table does not have serde property \"field.delim\"!");
     }
-
+    delimiterPattern = Pattern.compile(fieldDelimited, Pattern.LITERAL);
     // get the collection separator and map key separator
     // TODO: use serdeConstants.COLLECTION_DELIM when the typo is fixed
     collSep = LazyUtils.getByte(tbl.getProperty(COLLECTION_DELIM),
@@ -154,10 +159,10 @@ public Object doDeserialize(Writable blob) throws SerDeException {
     } else {
       throw new SerDeException(getClass() + ": expects either BytesWritable or Text object!");
     }
-    byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), "\1").getBytes());
+    byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM).getBytes());
     cachedLazyStruct.init(byteArrayRef, 0, byteArrayRef.getData().length);
     // use the multi-char delimiter to parse the lazy struct
-    cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(), fieldDelimited.getBytes());
+    cachedLazyStruct.parseMultiDelimit(rowStr, delimiterPattern, REPLACEMENT_DELIM);
     return cachedLazyStruct;
   }
 
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
index f066aaa3bf..3db1ce7798 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
@@ -20,8 +20,9 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
-import com.google.common.primitives.Bytes;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -278,8 +279,14 @@ public long getRawDataSerializedSize() {
     return serializedSize;
   }
 
-  // parse the struct using multi-char delimiter
-  public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
+  /**
+   * Parses rawRow using a multi-char delimiter.
+   *
+   * @param rawRow row to be parsed, delimited by fieldDelimit
+   * @param fieldDelimit pattern of the multi-char delimiter
+   * @param replacementDelim delimiter with which fieldDelimit has been replaced in rawRow
+   */
+  public void parseMultiDelimit(final String rawRow, final Pattern fieldDelimit, final String replacementDelim) {
     if (rawRow == null || fieldDelimit == null) {
       return;
     }
@@ -292,45 +299,62 @@ public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
       fieldInited = new boolean[fields.length];
       startPosition = new int[fields.length + 1];
     }
-    // the indexes of the delimiters
-    int[] delimitIndexes = findIndexes(rawRow, fieldDelimit);
-    int diff = fieldDelimit.length - 1;
+    final int delimiterLength = fieldDelimit.toString().length();
+    // the indices of the delimiters
+    final List<Integer> delimitIndices = findDelimiterIndicesInRow(rawRow, fieldDelimit);
+    // the first field always starts from 0, even when missing
     startPosition[0] = 0;
     for (int i = 1; i < fields.length; i++) {
-      if (delimitIndexes[i - 1] != -1) {
-        int start = delimitIndexes[i - 1] + fieldDelimit.length;
-        startPosition[i] = start - i * diff;
+      if (delimitIndices.get(i - 1) != -1) {
+        startPosition[i] =
+            getStartPositionWRTReplacementDelim(i, delimitIndices, delimiterLength, replacementDelim.length());
       } else {
         startPosition[i] = length + 1;
       }
     }
-    startPosition[fields.length] = length + 1;
+
+    // calculation of the length of the complete record with fields.length fields
+    final int totalRecordLength;
+    final int fieldLength = fields.length;
+    // this means we have more delimiters (and hence columns) than required (ideally n fields should have n-1 delimiters)
+    if (delimitIndices.size() >= fieldLength) {
+      totalRecordLength =
+          getStartPositionWRTReplacementDelim(fieldLength, delimitIndices, delimiterLength, replacementDelim.length());
+      LOG.warn("More delimiters[{}] found than expected[{}]. Ignoring bytes after extra delimiters",
+          delimitIndices.size(), fieldLength - 1);
+    } else {
+      totalRecordLength = length + 1;
+    }
+
+    startPosition[fieldLength] = totalRecordLength;
     Arrays.fill(fieldInited, false);
     parsed = true;
   }
 
-  // find all the indexes of the sub byte[]
-  private int[] findIndexes(byte[] array, byte[] target) {
-    if (fields.length <= 1) {
-      return new int[0];
-    }
-    int[] indexes = new int[fields.length - 1];
-    Arrays.fill(indexes, -1);
-    indexes[0] = Bytes.indexOf(array, target);
-    if (indexes[0] == -1) {
-      return indexes;
+  // MultiDelimitSerDe replaces the actual multi-char delimiter with replacementDelim ("\1"), which shortens the row;
+  // however, here we get rawRow with the original multi-char delimiter. Because of this we have to subtract those
+  // extra chars so the positions match the length of LazyNonPrimitive#bytes, which is used while reading data;
+  // see uncheckedGetField()
+  private int getStartPositionWRTReplacementDelim(final int startPosIndex, final List<Integer> delimitIndices,
+      final int delimiterLength, final int replacementDelimLength) {
+    final int extraBytesInDelim = delimiterLength - replacementDelimLength;
+    return (delimitIndices.get(startPosIndex - 1) + delimiterLength) - startPosIndex * extraBytesInDelim;
+  }
+
+  // find all the indices of the delimiter in the row,
+  // and if the row contains fewer delimiters than expected, fill the rest with -1
+  private List<Integer> findDelimiterIndicesInRow(final String row, final Pattern delimiter) {
+    List<Integer> delimiterIndices = new ArrayList<>();
+    Matcher matcher = delimiter.matcher(row);
+    while (matcher.find()) {
+      delimiterIndices.add(matcher.start());
     }
-    int indexInNewArray = indexes[0];
-    for (int i = 1; i < indexes.length; i++) {
-      array = Arrays.copyOfRange(array, indexInNewArray + target.length, array.length);
-      indexInNewArray = Bytes.indexOf(array, target);
-      if (indexInNewArray == -1) {
-        break;
-      }
-      indexes[i] = indexInNewArray + indexes[i - 1] + target.length;
+    // ideally n fields should have (n - 1) delimiters
+    for (int i = delimiterIndices.size(); i < fields.length - 1; i++) {
+      delimiterIndices.add(-1);
    }
-    return indexes;
+    return delimiterIndices;
  }
 
  /**
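
Editorial note (not part of the patch): the offset arithmetic above is easy to verify in isolation. The sketch below is a minimal, self-contained Java program that mirrors the patch's logic under its stated convention (field.delim "^,", replacement "\1"); the class name MultiDelimitOffsetDemo and the sample row are illustrative only.

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MultiDelimitOffsetDemo {
  public static void main(String[] args) {
    // The SerDe rewrites each multi-char delimiter to the single byte "\1" before
    // handing bytes to LazyStruct, so every delimiter preceding field i shrinks the
    // row by (delimiterLength - replacementLength) bytes.
    final String row = "9^,9^,^,9^,9"; // sample row with an empty third field
    final Pattern delim = Pattern.compile("^,", Pattern.LITERAL);
    final String replacement = "\1";
    final String replaced = delim.matcher(row).replaceAll(Matcher.quoteReplacement(replacement));

    final int delimiterLength = delim.toString().length();               // 2
    final int extraBytesInDelim = delimiterLength - replacement.length(); // 1

    // Same index collection as findDelimiterIndicesInRow(), minus the -1 padding.
    final List<Integer> indices = new ArrayList<>();
    final Matcher m = delim.matcher(row);
    while (m.find()) {
      indices.add(m.start());
    }

    // Same formula as getStartPositionWRTReplacementDelim(): the start of field i in
    // the replaced row is (index of the (i-1)-th delimiter + delimiterLength) - i * extraBytesInDelim.
    for (int i = 1; i <= indices.size(); i++) {
      final int start = (indices.get(i - 1) + delimiterLength) - i * extraBytesInDelim;
      System.out.println("field " + i + " starts at offset " + start + " of " + replaced.length());
    }
  }
}

For the sample row, the delimiters sit at indices 1, 4, 6, 9 of the original string, and the loop prints offsets 2, 4, 5, 7 for the 8-byte replaced row (fields "9", "", "9", "9" after the leading "9" at offset 0). Without the "- i * extraBytesInDelim" correction, the offsets would point past the fields in the shrunken byte array, which is exactly the mismatch the patch's helper compensates for.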