diff --git data/files/bigint.txt data/files/bigint.txt new file mode 100644 index 0000000..eb34d1d --- /dev/null +++ data/files/bigint.txt @@ -0,0 +1,2 @@ +46436262797987 +9223372036854775808 diff --git data/files/bool.txt data/files/bool.txt new file mode 100644 index 0000000..c508d53 --- /dev/null +++ data/files/bool.txt @@ -0,0 +1 @@ +false diff --git data/files/boolean.txt data/files/boolean.txt new file mode 100644 index 0000000..7674f7a --- /dev/null +++ data/files/boolean.txt @@ -0,0 +1,2 @@ +true +untrue diff --git data/files/boolint.txt data/files/boolint.txt new file mode 100644 index 0000000..bf5206f --- /dev/null +++ data/files/boolint.txt @@ -0,0 +1,3 @@ +false 25 +untrue 34 +true 9223372036854775 diff --git data/files/double.txt data/files/double.txt new file mode 100644 index 0000000..e0a53e2 --- /dev/null +++ data/files/double.txt @@ -0,0 +1,2 @@ +4363.464363426 +-656.757 diff --git data/files/float.txt data/files/float.txt new file mode 100644 index 0000000..2d7ead9 --- /dev/null +++ data/files/float.txt @@ -0,0 +1,2 @@ +35.6346 +-54.767 diff --git data/files/int.txt data/files/int.txt new file mode 100644 index 0000000..35c9ee4 --- /dev/null +++ data/files/int.txt @@ -0,0 +1,3 @@ +123636356 +2147483648 +-500 diff --git data/files/int2.txt data/files/int2.txt new file mode 100644 index 0000000..81cab8d --- /dev/null +++ data/files/int2.txt @@ -0,0 +1 @@ +54.55 diff --git data/files/mixedtype.txt data/files/mixedtype.txt new file mode 100644 index 0000000..024a708 --- /dev/null +++ data/files/mixedtype.txt @@ -0,0 +1,3 @@ +testsomething 356326.44 false +testsomemore 65 untrue +testmore -66.2 true diff --git data/files/smallint.txt data/files/smallint.txt new file mode 100644 index 0000000..ca83f26 --- /dev/null +++ data/files/smallint.txt @@ -0,0 +1,2 @@ +32765 +33379 diff --git data/files/stringbool.txt data/files/stringbool.txt new file mode 100644 index 0000000..a06fc58 --- /dev/null +++ data/files/stringbool.txt @@ -0,0 +1 @@ +testsomething 34 
diff --git data/files/stringfloat.txt data/files/stringfloat.txt new file mode 100644 index 0000000..3a6b917 --- /dev/null +++ data/files/stringfloat.txt @@ -0,0 +1,3 @@ +testsomething 56.6456 +testmorestuff -565.65 +moretesting 77 diff --git data/files/stringint.txt data/files/stringint.txt new file mode 100644 index 0000000..b41a351 --- /dev/null +++ data/files/stringint.txt @@ -0,0 +1,3 @@ +testsomething 34 +testmore 565.44 +testsomemore 9223372036854775808 diff --git data/files/tinyint.txt data/files/tinyint.txt new file mode 100644 index 0000000..e2cca8a --- /dev/null +++ data/files/tinyint.txt @@ -0,0 +1,2 @@ +101 +2776 diff --git ql/src/test/queries/clientnegative/serde_regex.q ql/src/test/queries/clientnegative/serde_regex.q index 6603b91..64ee1cc 100644 --- ql/src/test/queries/clientnegative/serde_regex.q +++ ql/src/test/queries/clientnegative/serde_regex.q @@ -1,15 +1,16 @@ USE default; --- This should fail because Regex SerDe supports only columns of type string +-- This should fail because Regex SerDe doesn't support TIMESTAMP, STRUCT CREATE TABLE serde_regex( host STRING, identity STRING, user STRING, - time STRING, + time TIMESTAMP, request STRING, status INT, size INT, referer STRING, - agent STRING) + agent STRING, + strct STRUCT) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' WITH SERDEPROPERTIES ( "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?") diff --git ql/src/test/queries/clientpositive/serde_regex.q ql/src/test/queries/clientpositive/serde_regex.q index 306cbc5..41eda20 100644 --- ql/src/test/queries/clientpositive/serde_regex.q +++ ql/src/test/queries/clientpositive/serde_regex.q @@ -6,7 +6,7 @@ CREATE TABLE serde_regex( time STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' @@ -22,7 +22,7 @@ CREATE TABLE serde_regex( time 
STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' @@ -41,3 +41,189 @@ SELECT host, size, status from serde_regex; SELECT time from serde_regex; DROP TABLE serde_regex; + +-- test tinyint + +CREATE TABLE serde_regex( + value tinyint ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "^(0?[0-9]?[0-9]|1[0-1][0-9]|12[0-7])$" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../data/files/tinyint.txt" INTO TABLE serde_regex; + +SELECT value from serde_regex; + +DROP TABLE serde_regex; + +-- test smallint + +CREATE TABLE serde_regex( + value smallint ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]+)" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../data/files/smallint.txt" INTO TABLE serde_regex; + +SELECT value from serde_regex; + +DROP TABLE serde_regex; + +-- test int + +CREATE TABLE serde_regex( + value INT) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]+)" +) +STORED AS TEXTFILE; +LOAD DATA LOCAL INPATH "../data/files/int.txt" INTO TABLE serde_regex; + +SELECT value from serde_regex; + +DROP TABLE serde_regex; + +-- test bigint + +CREATE TABLE serde_regex( + value bigint ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]+)" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../data/files/bigint.txt" INTO TABLE serde_regex; + +SELECT value from serde_regex; + +DROP TABLE serde_regex; + +-- test float + +CREATE TABLE serde_regex( + value float ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]*\.?[0-9]+)" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../data/files/float.txt" INTO TABLE serde_regex; + +SELECT value from serde_regex; 
+ +DROP TABLE serde_regex; + +-- test double + +CREATE TABLE serde_regex( + value double) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]*\.?[0-9]+)" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../data/files/double.txt" INTO TABLE serde_regex; + +SELECT value from serde_regex; + +DROP TABLE serde_regex; + +-- test boolean + +CREATE TABLE serde_regex( + value boolean ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "(true|false)" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../data/files/boolean.txt" INTO TABLE serde_regex; + +SELECT value from serde_regex; + +DROP TABLE serde_regex; + +-- test combination of column types + +CREATE TABLE serde_regex( + value1 boolean, + value2 int) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "(false|true) ([-+]?[0-9]+)" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../data/files/boolint.txt" INTO TABLE serde_regex; + +SELECT * from serde_regex; + +SELECT value1 from serde_regex; + +DROP TABLE serde_regex; + + +CREATE TABLE serde_regex( + value1 string, + value2 int) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([a-z]+) ([-+]?[0-9]+)" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../data/files/stringint.txt" INTO TABLE serde_regex; + +SELECT * from serde_regex; + +SELECT value1 from serde_regex; + +SELECT value2 from serde_regex; + +DROP TABLE serde_regex; + +CREATE TABLE serde_regex( + value1 string, + value2 float) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([a-z]+) ([-+]?[0-9]*\.?[0-9]+)" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../data/files/stringfloat.txt" INTO TABLE serde_regex; + +SELECT * from serde_regex; + +SELECT value1 from serde_regex; + +SELECT value2 from serde_regex; + +DROP TABLE 
serde_regex; + + +CREATE TABLE serde_regex( + value1 string, + value2 float, + value3 boolean) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([a-z]+) ([-+]?[0-9]*\.?[0-9]+) (false|true)" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../data/files/mixedtype.txt" INTO TABLE serde_regex; + +SELECT * from serde_regex; + +SELECT value2, value3 from serde_regex; \ No newline at end of file diff --git ql/src/test/results/clientnegative/serde_regex.q.out ql/src/test/results/clientnegative/serde_regex.q.out index 03fe907..1c5b5fa 100644 --- ql/src/test/results/clientnegative/serde_regex.q.out +++ ql/src/test/results/clientnegative/serde_regex.q.out @@ -2,17 +2,18 @@ PREHOOK: query: USE default PREHOOK: type: SWITCHDATABASE POSTHOOK: query: USE default POSTHOOK: type: SWITCHDATABASE -PREHOOK: query: -- This should fail because Regex SerDe supports only columns of type string +PREHOOK: query: -- This should fail because Regex SerDe doesn't support TIMESTAMP, STRUCT CREATE TABLE serde_regex( host STRING, identity STRING, user STRING, - time STRING, + time TIMESTAMP, request STRING, status INT, size INT, referer STRING, - agent STRING) + agent STRING, + strct STRUCT) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' WITH SERDEPROPERTIES ( "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?") diff --git ql/src/test/results/clientpositive/serde_regex.q.out ql/src/test/results/clientpositive/serde_regex.q.out index 1d53c68..df40bdc 100644 --- ql/src/test/results/clientpositive/serde_regex.q.out +++ ql/src/test/results/clientpositive/serde_regex.q.out @@ -6,7 +6,7 @@ CREATE TABLE serde_regex( time STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' @@ -23,7 +23,7 @@ CREATE TABLE serde_regex( time STRING, request 
STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' @@ -33,7 +33,7 @@ WITH SERDEPROPERTIES ( STORED AS TEXTFILE POSTHOOK: type: CREATETABLE ABSTRACT SYNTAX TREE: - (TOK_CREATETABLE (TOK_TABNAME serde_regex) TOK_LIKETABLE (TOK_TABCOLLIST (TOK_TABCOL host TOK_STRING) (TOK_TABCOL identity TOK_STRING) (TOK_TABCOL user TOK_STRING) (TOK_TABCOL time TOK_STRING) (TOK_TABCOL request TOK_STRING) (TOK_TABCOL status TOK_STRING) (TOK_TABCOL size TOK_STRING) (TOK_TABCOL referer TOK_STRING) (TOK_TABCOL agent TOK_STRING)) (TOK_TABLESERIALIZER (TOK_SERDENAME 'org.apache.hadoop.hive.serde2.RegexSerDe' (TOK_TABLEPROPERTIES (TOK_TABLEPROPLIST (TOK_TABLEPROPERTY "input.regex" "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"))))) TOK_TBLTEXTFILE) + (TOK_CREATETABLE (TOK_TABNAME serde_regex) TOK_LIKETABLE (TOK_TABCOLLIST (TOK_TABCOL host TOK_STRING) (TOK_TABCOL identity TOK_STRING) (TOK_TABCOL user TOK_STRING) (TOK_TABCOL time TOK_STRING) (TOK_TABCOL request TOK_STRING) (TOK_TABCOL status TOK_STRING) (TOK_TABCOL size TOK_INT) (TOK_TABCOL referer TOK_STRING) (TOK_TABCOL agent TOK_STRING)) (TOK_TABLESERIALIZER (TOK_SERDENAME 'org.apache.hadoop.hive.serde2.RegexSerDe' (TOK_TABLEPROPERTIES (TOK_TABLEPROPLIST (TOK_TABLEPROPERTY "input.regex" "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"))))) TOK_TBLTEXTFILE) STAGE DEPENDENCIES: Stage-0 is a root stage @@ -42,7 +42,7 @@ STAGE PLANS: Stage: Stage-0 Create Table Operator: Create Table - columns: host string, identity string, user string, time string, request string, status string, size string, referer string, agent string + columns: host string, identity string, user string, time string, request string, status string, size int, referer string, agent string if not exists: false input 
format: org.apache.hadoop.mapred.TextInputFormat # buckets: -1 @@ -61,7 +61,7 @@ PREHOOK: query: CREATE TABLE serde_regex( time STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' @@ -77,7 +77,7 @@ POSTHOOK: query: CREATE TABLE serde_regex( time STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' @@ -137,3 +137,559 @@ POSTHOOK: query: DROP TABLE serde_regex POSTHOOK: type: DROPTABLE POSTHOOK: Input: default@serde_regex POSTHOOK: Output: default@serde_regex +PREHOOK: query: -- test tinyint + +CREATE TABLE serde_regex( + value tinyint ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "^(0?[0-9]?[0-9]|1[0-1][0-9]|12[0-7])$" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- test tinyint + +CREATE TABLE serde_regex( + value tinyint ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "^(0?[0-9]?[0-9]|1[0-1][0-9]|12[0-7])$" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/tinyint.txt" INTO TABLE serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/tinyint.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT value from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +101 +NULL +PREHOOK: query: DROP TABLE serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex +PREHOOK: Output: 
default@serde_regex +POSTHOOK: query: DROP TABLE serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex +POSTHOOK: Output: default@serde_regex +PREHOOK: query: -- test smallint + +CREATE TABLE serde_regex( + value smallint ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]+)" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- test smallint + +CREATE TABLE serde_regex( + value smallint ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]+)" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/smallint.txt" INTO TABLE serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/smallint.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT value from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +32765 +NULL +PREHOOK: query: DROP TABLE serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex +PREHOOK: Output: default@serde_regex +POSTHOOK: query: DROP TABLE serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex +POSTHOOK: Output: default@serde_regex +PREHOOK: query: -- test int + +CREATE TABLE serde_regex( + value INT) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]+)" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- test int + +CREATE TABLE serde_regex( + value INT) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES 
( + "input.regex" = "([-+]?[0-9]+)" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/int.txt" INTO TABLE serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/int.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT value from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +123636356 +NULL +-500 +PREHOOK: query: DROP TABLE serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex +PREHOOK: Output: default@serde_regex +POSTHOOK: query: DROP TABLE serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex +POSTHOOK: Output: default@serde_regex +PREHOOK: query: -- test bigint + +CREATE TABLE serde_regex( + value bigint ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]+)" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- test bigint + +CREATE TABLE serde_regex( + value bigint ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]+)" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/bigint.txt" INTO TABLE serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/bigint.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT value from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here 
#### +POSTHOOK: query: SELECT value from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +46436262797987 +NULL +PREHOOK: query: DROP TABLE serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex +PREHOOK: Output: default@serde_regex +POSTHOOK: query: DROP TABLE serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex +POSTHOOK: Output: default@serde_regex +PREHOOK: query: -- test float + +CREATE TABLE serde_regex( + value float ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]*\.?[0-9]+)" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- test float + +CREATE TABLE serde_regex( + value float ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]*\.?[0-9]+)" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/float.txt" INTO TABLE serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/float.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT value from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +35.6346 +-54.767 +PREHOOK: query: DROP TABLE serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex +PREHOOK: Output: default@serde_regex +POSTHOOK: query: DROP TABLE serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex +POSTHOOK: Output: default@serde_regex +PREHOOK: query: -- test double + +CREATE TABLE serde_regex( + value double) +ROW FORMAT SERDE 
'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]*\.?[0-9]+)" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- test double + +CREATE TABLE serde_regex( + value double) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([-+]?[0-9]*\.?[0-9]+)" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/double.txt" INTO TABLE serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/double.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT value from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +4363.464363426 +-656.757 +PREHOOK: query: DROP TABLE serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex +PREHOOK: Output: default@serde_regex +POSTHOOK: query: DROP TABLE serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex +POSTHOOK: Output: default@serde_regex +PREHOOK: query: -- test boolean + +CREATE TABLE serde_regex( + value boolean ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "(true|false)" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- test boolean + +CREATE TABLE serde_regex( + value boolean ) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "(true|false)" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/boolean.txt" INTO TABLE serde_regex +PREHOOK: 
type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/boolean.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT value from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +true +NULL +PREHOOK: query: DROP TABLE serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex +PREHOOK: Output: default@serde_regex +POSTHOOK: query: DROP TABLE serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex +POSTHOOK: Output: default@serde_regex +PREHOOK: query: -- test combination of column types + +CREATE TABLE serde_regex( + value1 boolean, + value2 int) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "(false|true) ([-+]?[0-9]+)" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- test combination of column types + +CREATE TABLE serde_regex( + value1 boolean, + value2 int) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "(false|true) ([-+]?[0-9]+)" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/boolint.txt" INTO TABLE serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/boolint.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT * from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT * from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was 
here #### +false 25 +NULL NULL +true NULL +PREHOOK: query: SELECT value1 from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value1 from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +false +NULL +true +PREHOOK: query: DROP TABLE serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex +PREHOOK: Output: default@serde_regex +POSTHOOK: query: DROP TABLE serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex +POSTHOOK: Output: default@serde_regex +PREHOOK: query: CREATE TABLE serde_regex( + value1 string, + value2 int) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([a-z]+) ([-+]?[0-9]+)" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE serde_regex( + value1 string, + value2 int) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([a-z]+) ([-+]?[0-9]+)" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/stringint.txt" INTO TABLE serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/stringint.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT * from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT * from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +testsomething 34 +NULL NULL +testsomemore NULL +PREHOOK: query: SELECT value1 from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value1 
from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +testsomething +NULL +testsomemore +PREHOOK: query: SELECT value2 from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value2 from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +34 +NULL +NULL +PREHOOK: query: DROP TABLE serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex +PREHOOK: Output: default@serde_regex +POSTHOOK: query: DROP TABLE serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex +POSTHOOK: Output: default@serde_regex +PREHOOK: query: CREATE TABLE serde_regex( + value1 string, + value2 float) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([a-z]+) ([-+]?[0-9]*\.?[0-9]+)" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE serde_regex( + value1 string, + value2 float) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([a-z]+) ([-+]?[0-9]*\.?[0-9]+)" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/stringfloat.txt" INTO TABLE serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/stringfloat.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT * from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT * from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +testsomething 56.6456 +testmorestuff -565.65 +moretesting 77.0 +PREHOOK: query: SELECT value1 
from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value1 from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +testsomething +testmorestuff +moretesting +PREHOOK: query: SELECT value2 from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value2 from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +56.6456 +-565.65 +77.0 +PREHOOK: query: DROP TABLE serde_regex +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex +PREHOOK: Output: default@serde_regex +POSTHOOK: query: DROP TABLE serde_regex +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex +POSTHOOK: Output: default@serde_regex +PREHOOK: query: CREATE TABLE serde_regex( + value1 string, + value2 float, + value3 boolean) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([a-z]+) ([-+]?[0-9]*\.?[0-9]+) (false|true)" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE serde_regex( + value1 string, + value2 float, + value3 boolean) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([a-z]+) ([-+]?[0-9]*\.?[0-9]+) (false|true)" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@serde_regex +PREHOOK: query: LOAD DATA LOCAL INPATH "../data/files/mixedtype.txt" INTO TABLE serde_regex +PREHOOK: type: LOAD +PREHOOK: Output: default@serde_regex +POSTHOOK: query: LOAD DATA LOCAL INPATH "../data/files/mixedtype.txt" INTO TABLE serde_regex +POSTHOOK: type: LOAD +POSTHOOK: Output: default@serde_regex +PREHOOK: query: SELECT * from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: 
SELECT * from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +testsomething 356326.44 false +NULL NULL NULL +testmore -66.2 true +PREHOOK: query: SELECT value2, value3 from serde_regex +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex +#### A masked pattern was here #### +POSTHOOK: query: SELECT value2, value3 from serde_regex +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex +#### A masked pattern was here #### +356326.44 false +NULL NULL +-66.2 true diff --git serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java index 243710b..3669002 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java @@ -33,49 +33,51 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; /** - * RegexSerDe uses regular expression (regex) to serialize/deserialize. + * RegexSerDe uses regular expression (regex) to deserialize data. It doesn't + * support data serialization. * - * It can deserialize the data using regex and extracts groups as columns. It - * can also serialize the row object using a format string. + * It can deserialize the data using regex and extracts groups as columns. * - * In deserialization stage, if a row does not match the regex, then all columns - * in the row will be NULL. If a row matches the regex but has less than - * expected groups, the missing groups will be NULL. 
If a row matches the regex - * but has more than expected groups, the additional groups are just ignored. + * In deserialization stage, if the number of groups in a row doesn't match the number of columns in + * the table, an exception is raised. If a row does not match the regex, then all columns + * in the row will be NULL. If a row matches the regex but a group cannot be converted to its + * column type, that column will be NULL. * - * NOTE: Obviously, all columns have to be strings. Users can use - * "CAST(a AS INT)" to convert columns to other types. + * NOTE: Regex SerDe supports the primitive column types TINYINT, SMALLINT, + * INT, BIGINT, FLOAT, DOUBLE, STRING and BOOLEAN. * - * NOTE: This implementation is using String, and javaStringObjectInspector. A + * NOTE: This implementation uses javaStringObjectInspector for String type. A * more efficient implementation should use UTF-8 encoded Text and * writableStringObjectInspector. We should switch to that when we have a UTF-8 * based Regex library. 
*/ + public class RegexSerDe implements SerDe { public static final Log LOG = LogFactory.getLog(RegexSerDe.class.getName()); int numColumns; String inputRegex; - String outputFormatString; Pattern inputPattern; StructObjectInspector rowOI; - ArrayList row; + Object[] outputFields; Text outputRowText; boolean alreadyLoggedNoMatch = false; boolean alreadyLoggedPartialMatch = false; - @Override + ArrayList row; + List columnTypes ; + + @Override public void initialize(Configuration conf, Properties tbl) throws SerDeException { @@ -83,14 +85,14 @@ public class RegexSerDe implements SerDe { // Read the configuration parameters inputRegex = tbl.getProperty("input.regex"); - outputFormatString = tbl.getProperty("output.format.string"); String columnNameProperty = tbl.getProperty(Constants.LIST_COLUMNS); String columnTypeProperty = tbl.getProperty(Constants.LIST_COLUMN_TYPES); + boolean inputRegexIgnoreCase = "true".equalsIgnoreCase(tbl .getProperty("input.regex.case.insensitive")); // output format string is not supported anymore, warn user of deprecation - if (outputFormatString != null) { + if (null != tbl.getProperty("output.format.string")) { LOG.warn("output.format.string has been deprecated"); } @@ -100,43 +102,56 @@ public class RegexSerDe implements SerDe { + (inputRegexIgnoreCase ? Pattern.CASE_INSENSITIVE : 0)); } else { inputPattern = null; - } - - // If inputPattern is null, raise an exception - if (inputPattern == null) { throw new SerDeException( "This table does not have serde property \"input.regex\"!"); } List columnNames = Arrays.asList(columnNameProperty.split(",")); - List columnTypes = TypeInfoUtils + columnTypes = TypeInfoUtils .getTypeInfosFromTypeString(columnTypeProperty); assert columnNames.size() == columnTypes.size(); numColumns = columnNames.size(); - // All columns have to be of type STRING. 
+ /* Constructing the row ObjectInspector: + * The row consists of some set of primitive columns, each column will + * be a java object of primitive type. + */ + List columnOIs = new ArrayList( + columnNames.size()); + for (int c = 0; c < numColumns; c++) { - if (!columnTypes.get(c).equals(TypeInfoFactory.stringTypeInfo)) { + + String typeName = columnTypes.get(c).getTypeName(); + /* Compare type names by value: == on Strings only tests reference identity. */ + if (Constants.STRING_TYPE_NAME.equals(typeName)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); + } else if (Constants.TINYINT_TYPE_NAME.equals(typeName)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaByteObjectInspector); + } else if (Constants.SMALLINT_TYPE_NAME.equals(typeName)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaShortObjectInspector); + } else if (Constants.INT_TYPE_NAME.equals(typeName)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector); + } else if (Constants.BIGINT_TYPE_NAME.equals(typeName)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector); + } else if (Constants.FLOAT_TYPE_NAME.equals(typeName)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaFloatObjectInspector); + } else if (Constants.DOUBLE_TYPE_NAME.equals(typeName)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); + } else if (Constants.BOOLEAN_TYPE_NAME.equals(typeName)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector); + } else { throw new SerDeException(getClass().getName() - + " only accepts string columns, but column[" + c + "] named " - + columnNames.get(c) + " has type " + columnTypes.get(c)); + + " doesn't allow column [" + c + "] named " + + columnNames.get(c) + " with type " + columnTypes.get(c)); } } - // Constructing the row ObjectInspector: - // The row consists of some string columns, each column will be a java - // String object. 
- List columnOIs = new ArrayList( - columnNames.size()); - for (int c = 0; c < numColumns; c++) { - columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); - } // StandardStruct uses ArrayList to store the row. rowOI = ObjectInspectorFactory.getStandardStructObjectInspector( columnNames, columnOIs); // Constructing the row object, etc, which will be reused for all rows. - row = new ArrayList(numColumns); + row = new ArrayList(numColumns); for (int c = 0; c < numColumns; c++) { row.add(null); } @@ -155,22 +170,14 @@ public class RegexSerDe implements SerDe { } // Number of rows not matching the regex - long unmatchedRows = 0; - long nextUnmatchedRows = 1; + long unmatchedRowsCount = 0; // Number of rows that match the regex but have missing groups. - long partialMatchedRows = 0; - long nextPartialMatchedRows = 1; - - long getNextNumberToDisplay(long now) { - return now * 10; - } + long partialMatchedRowsCount = 0; @Override public Object deserialize(Writable blob) throws SerDeException { - Text rowText = (Text) blob; - Matcher m = inputPattern.matcher(rowText.toString()); if (m.groupCount() != numColumns) { @@ -179,15 +186,11 @@ public class RegexSerDe implements SerDe { // If do not match, ignore the line, return a row with all nulls. if (!m.matches()) { - unmatchedRows++; - if (unmatchedRows >= nextUnmatchedRows) { - nextUnmatchedRows = getNextNumberToDisplay(nextUnmatchedRows); - - if (alreadyLoggedNoMatch == false) { + unmatchedRowsCount++; + if (!alreadyLoggedNoMatch) { // Report the row if its the first time - LOG.warn("" + unmatchedRows + " unmatched rows are found: " + rowText); + LOG.warn("" + unmatchedRowsCount + " unmatched rows are found: " + rowText); alreadyLoggedNoMatch = true; - } } return null; } @@ -195,21 +198,53 @@ public class RegexSerDe implements SerDe { // Otherwise, return the row. 
for (int c = 0; c < numColumns; c++) { try { - row.set(c, m.group(c + 1)); + /* Type names are compared by value; a group that did not participate in the match is null. */ + String t = m.group(c+1); + String typeName = columnTypes.get(c).getTypeName(); + + // Convert the column to the correct type when needed and set in row obj + if (Constants.STRING_TYPE_NAME.equals(typeName)) { + row.set(c, t); + } else if (Constants.TINYINT_TYPE_NAME.equals(typeName)) { + Byte b; + b = Byte.valueOf(t); + row.set(c,b); + } else if (Constants.SMALLINT_TYPE_NAME.equals(typeName)) { + Short s; + s = Short.valueOf(t); + row.set(c,s); + } else if (Constants.INT_TYPE_NAME.equals(typeName)) { + Integer i; + i = Integer.valueOf(t); + row.set(c, i); + } else if (Constants.BIGINT_TYPE_NAME.equals(typeName)) { + Long l; + l = Long.valueOf(t); + row.set(c, l); + } else if (Constants.FLOAT_TYPE_NAME.equals(typeName)) { + Float f; + f = Float.valueOf(t); + row.set(c,f); + } else if (Constants.DOUBLE_TYPE_NAME.equals(typeName)) { + Double d; + d = Double.valueOf(t); + row.set(c,d); + } else if (Constants.BOOLEAN_TYPE_NAME.equals(typeName)) { + Boolean b; + b = (t == null) ? null : Boolean.valueOf(t); + row.set(c,b); + } } catch (RuntimeException e) { - partialMatchedRows++; - if (partialMatchedRows >= nextPartialMatchedRows) { - nextPartialMatchedRows = getNextNumberToDisplay(nextPartialMatchedRows); - if (alreadyLoggedPartialMatch == false) { - // Report the row if its the first time - LOG.warn("" + partialMatchedRows - + " partially unmatched rows are found, " + " cannot find group " - + c + ": " + rowText); + partialMatchedRowsCount++; + if (!alreadyLoggedPartialMatch) { + // Report the row if it's the first row + LOG.warn("" + partialMatchedRowsCount + + " partially unmatched rows are found, " + " cannot find group " + + c + ": " + rowText); alreadyLoggedPartialMatch = true; } row.set(c, null); } - } } return row; } @@ -217,7 +252,8 @@ public class RegexSerDe implements SerDe { @Override public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { - throw new UnsupportedOperationException("Regex SerDe doesn't 
support the serialize() method"); + throw new UnsupportedOperationException( + "Regex SerDe doesn't support the serialize() method"); } public SerDeStats getSerDeStats() { @@ -225,4 +261,4 @@ public class RegexSerDe implements SerDe { return null; } -} +} \ No newline at end of file