diff --git ql/src/test/queries/clientnegative/serde_regex.q ql/src/test/queries/clientnegative/serde_regex.q index 6603b91..64ee1cc 100644 --- ql/src/test/queries/clientnegative/serde_regex.q +++ ql/src/test/queries/clientnegative/serde_regex.q @@ -1,15 +1,16 @@ USE default; --- This should fail because Regex SerDe supports only columns of type string +-- This should fail because Regex SerDe doesn't support TIMESTAMP, STRUCT CREATE TABLE serde_regex( host STRING, identity STRING, user STRING, - time STRING, + time TIMESTAMP, request STRING, status INT, size INT, referer STRING, - agent STRING) + agent STRING, + strct STRUCT) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' WITH SERDEPROPERTIES ( "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?") diff --git ql/src/test/queries/clientpositive/serde_regex.q ql/src/test/queries/clientpositive/serde_regex.q index c6809cb..c3254ca 100644 --- ql/src/test/queries/clientpositive/serde_regex.q +++ ql/src/test/queries/clientpositive/serde_regex.q @@ -6,7 +6,7 @@ CREATE TABLE serde_regex( time STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' @@ -22,7 +22,7 @@ CREATE TABLE serde_regex( time STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' diff --git ql/src/test/results/clientnegative/serde_regex.q.out ql/src/test/results/clientnegative/serde_regex.q.out index 03fe907..1c5b5fa 100644 --- ql/src/test/results/clientnegative/serde_regex.q.out +++ ql/src/test/results/clientnegative/serde_regex.q.out @@ -2,17 +2,18 @@ PREHOOK: query: USE default PREHOOK: type: SWITCHDATABASE POSTHOOK: query: USE default POSTHOOK: type: SWITCHDATABASE -PREHOOK: query: -- This should fail because Regex SerDe supports only columns of type string +PREHOOK: query: -- This should fail because Regex SerDe doesn't support TIMESTAMP, STRUCT CREATE TABLE serde_regex( host STRING, identity STRING, user STRING, - time STRING, + time TIMESTAMP, request STRING, status INT, size INT, referer STRING, - agent STRING) + agent STRING, + strct STRUCT) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' WITH SERDEPROPERTIES ( "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?") diff --git ql/src/test/results/clientpositive/serde_regex.q.out ql/src/test/results/clientpositive/serde_regex.q.out index a8ce604..a933538 100644 --- ql/src/test/results/clientpositive/serde_regex.q.out +++ ql/src/test/results/clientpositive/serde_regex.q.out @@ -6,7 +6,7 @@ CREATE TABLE serde_regex( time STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' @@ -23,7 +23,7 @@ CREATE TABLE serde_regex( time STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' @@ -33,7 +33,7 @@ WITH SERDEPROPERTIES ( STORED AS TEXTFILE POSTHOOK: type: CREATETABLE ABSTRACT SYNTAX TREE: - (TOK_CREATETABLE (TOK_TABNAME serde_regex) TOK_LIKETABLE (TOK_TABCOLLIST (TOK_TABCOL host TOK_STRING) (TOK_TABCOL identity TOK_STRING) (TOK_TABCOL user TOK_STRING) (TOK_TABCOL time TOK_STRING) (TOK_TABCOL request TOK_STRING) (TOK_TABCOL status TOK_STRING) (TOK_TABCOL size TOK_STRING) (TOK_TABCOL referer TOK_STRING) (TOK_TABCOL agent TOK_STRING)) (TOK_TABLESERIALIZER (TOK_SERDENAME 'org.apache.hadoop.hive.serde2.RegexSerDe' (TOK_TABLEPROPERTIES (TOK_TABLEPROPLIST (TOK_TABLEPROPERTY "input.regex" "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"))))) TOK_TBLTEXTFILE) + (TOK_CREATETABLE (TOK_TABNAME serde_regex) TOK_LIKETABLE (TOK_TABCOLLIST (TOK_TABCOL host TOK_STRING) (TOK_TABCOL identity TOK_STRING) (TOK_TABCOL user TOK_STRING) (TOK_TABCOL time TOK_STRING) (TOK_TABCOL request TOK_STRING) (TOK_TABCOL status TOK_STRING) (TOK_TABCOL size TOK_INT) (TOK_TABCOL referer TOK_STRING) (TOK_TABCOL agent TOK_STRING)) (TOK_TABLESERIALIZER (TOK_SERDENAME 'org.apache.hadoop.hive.serde2.RegexSerDe' (TOK_TABLEPROPERTIES (TOK_TABLEPROPLIST (TOK_TABLEPROPERTY "input.regex" "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"))))) TOK_TBLTEXTFILE) STAGE DEPENDENCIES: Stage-0 is a root stage @@ -42,7 +42,7 @@ STAGE PLANS: Stage: Stage-0 Create Table Operator: Create Table - columns: host string, identity string, user string, time string, request string, status string, size string, referer string, agent string + columns: host string, identity string, user string, time string, request string, status string, size int, referer string, agent string if not exists: false input format: org.apache.hadoop.mapred.TextInputFormat # buckets: -1 @@ -61,7 +61,7 @@ PREHOOK: query: CREATE TABLE serde_regex( time STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' @@ -77,7 +77,7 @@ POSTHOOK: query: CREATE TABLE serde_regex( time STRING, request STRING, status STRING, - size STRING, + size INT, referer STRING, agent STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' diff --git serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java index e728244..ae7693a 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java @@ -33,7 +33,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; @@ -49,10 +48,11 @@ * expected groups, the missing groups will be NULL. If a row matches the regex * but has more than expected groups, the additional groups are just ignored. * - * NOTE: Obviously, all columns have to be strings. Users can use - * "CAST(a AS INT)" to convert columns to other types. + * NOTE: Regex SerDe supports primitive column types such as TINYINT, SMALLINT, + * INT, BIGINT, FLOAT, DOUBLE, STRING and BOOLEAN * - * NOTE: This implementation is using String, and javaStringObjectInspector. A + * + * NOTE: This implementation uses javaStringObjectInspector for STRING. A * more efficient implementation should use UTF-8 encoded Text and * writableStringObjectInspector. We should switch to that when we have a UTF-8 * based Regex library. @@ -67,7 +67,8 @@ Pattern inputPattern; StructObjectInspector rowOI; - ArrayList row; + List row; + List columnTypes; Object[] outputFields; Text outputRowText; @@ -104,34 +105,47 @@ public void initialize(Configuration conf, Properties tbl) List columnNames = Arrays.asList(columnNameProperty.split(",")); - List columnTypes = TypeInfoUtils + columnTypes = TypeInfoUtils .getTypeInfosFromTypeString(columnTypeProperty); assert columnNames.size() == columnTypes.size(); numColumns = columnNames.size(); - // All columns have to be of type STRING. + /* Constructing the row ObjectInspector: + * The row consists of some set of primitive columns, each column will + * be a java object of primitive type. + */ + List columnOIs = new ArrayList(columnNames.size()); for (int c = 0; c < numColumns; c++) { - if (!columnTypes.get(c).equals(TypeInfoFactory.stringTypeInfo)) { - throw new SerDeException(getClass().getName() - + " only accepts string columns, but column[" + c + "] named " - + columnNames.get(c) + " has type " + columnTypes.get(c)); - } - } + String typeName = columnTypes.get(c).getTypeName(); + if (typeName.equals(serdeConstants.STRING_TYPE_NAME)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); + } else if (typeName.equals(serdeConstants.TINYINT_TYPE_NAME)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaByteObjectInspector); + } else if (typeName.equals(serdeConstants.SMALLINT_TYPE_NAME)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaShortObjectInspector); + } else if (typeName.equals(serdeConstants.INT_TYPE_NAME)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector); + } else if (typeName.equals(serdeConstants.BIGINT_TYPE_NAME)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector); + } else if (typeName.equals(serdeConstants.FLOAT_TYPE_NAME)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaFloatObjectInspector); + } else if (typeName.equals(serdeConstants.DOUBLE_TYPE_NAME)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); + } else if (typeName.equals(serdeConstants.BOOLEAN_TYPE_NAME)) { + columnOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector); + } else { + throw new SerDeException(getClass().getName() + + " doesn't allow column [" + c + "] named " + + columnNames.get(c) + " with type " + columnTypes.get(c)); + } + } - // Constructing the row ObjectInspector: - // The row consists of some string columns, each column will be a java - // String object. - List columnOIs = new ArrayList( - columnNames.size()); - for (int c = 0; c < numColumns; c++) { - columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); - } // StandardStruct uses ArrayList to store the row. rowOI = ObjectInspectorFactory.getStandardStructObjectInspector( columnNames, columnOIs); + row = new ArrayList(numColumns); // Constructing the row object, etc, which will be reused for all rows. - row = new ArrayList(numColumns); for (int c = 0; c < numColumns; c++) { row.add(null); } @@ -157,9 +171,7 @@ public ObjectInspector getObjectInspector() throws SerDeException { @Override public Object deserialize(Writable blob) throws SerDeException { - Text rowText = (Text) blob; - Matcher m = inputPattern.matcher(rowText.toString()); if (m.groupCount() != numColumns) { @@ -180,15 +192,49 @@ public Object deserialize(Writable blob) throws SerDeException { // Otherwise, return the row. for (int c = 0; c < numColumns; c++) { try { - row.set(c, m.group(c + 1)); + String t = m.group(c+1); + String typeName = columnTypes.get(c).getTypeName(); + + // Convert the column to the correct type when needed and set in row obj + if (typeName.equals(serdeConstants.STRING_TYPE_NAME)) { + row.set(c, t); + } else if (typeName.equals(serdeConstants.TINYINT_TYPE_NAME)) { + Byte b; + b = Byte.valueOf(t); + row.set(c,b); + } else if (typeName.equals(serdeConstants.SMALLINT_TYPE_NAME)) { + Short s; + s = Short.valueOf(t); + row.set(c,s); + } else if (typeName.equals(serdeConstants.INT_TYPE_NAME)) { + Integer i; + i = Integer.valueOf(t); + row.set(c, i); + } else if (typeName.equals(serdeConstants.BIGINT_TYPE_NAME)) { + Long l; + l = Long.valueOf(t); + row.set(c, l); + } else if (typeName.equals(serdeConstants.FLOAT_TYPE_NAME)) { + Float f; + f = Float.valueOf(t); + row.set(c,f); + } else if (typeName.equals(serdeConstants.DOUBLE_TYPE_NAME)) { + Double d; + d = Double.valueOf(t); + row.set(c,d); + } else if (typeName.equals(serdeConstants.BOOLEAN_TYPE_NAME)) { + Boolean b; + b = Boolean.valueOf(t); + row.set(c,b); + } } catch (RuntimeException e) { - partialMatchedRowsCount++; - if (!alreadyLoggedPartialMatch) { - // Report the row if its the first time - LOG.warn("" + partialMatchedRowsCount - + " partially unmatched rows are found, " + " cannot find group " - + c + ": " + rowText); - alreadyLoggedPartialMatch = true; + partialMatchedRowsCount++; + if (!alreadyLoggedPartialMatch) { + // Report the row if its the first row + LOG.warn("" + partialMatchedRowsCount + + " partially unmatched rows are found, " + " cannot find group " + + c + ": " + rowText); + alreadyLoggedPartialMatch = true; } row.set(c, null); } @@ -207,5 +253,4 @@ public SerDeStats getSerDeStats() { // no support for statistics return null; } - }