diff --git a/data/files/parquet_mixed_case b/data/files/parquet_mixed_case
new file mode 100644
index 0000000..f6af0f5
Binary files /dev/null and b/data/files/parquet_mixed_case differ
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
index c100dce..dc51efd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
@@ -26,6 +26,7 @@
 import org.apache.hadoop.io.ArrayWritable;
 import org.apache.hadoop.util.StringUtils;
 
+import parquet.column.ColumnDescriptor;
 import parquet.hadoop.api.ReadSupport;
 import parquet.io.api.RecordMaterializer;
 import parquet.schema.MessageType;
@@ -46,8 +47,8 @@
 
   private static final String TABLE_SCHEMA = "table_schema";
   public static final String HIVE_SCHEMA_KEY = "HIVE_TABLE_SCHEMA";
-  public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access";
-
+  public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access";
+
   /**
    * From a string which columns names (including hive column), return a list
    * of string columns
@@ -75,12 +76,16 @@
     final Map<String, String> contextMetadata = new HashMap<String, String>();
     if (columns != null) {
       final List<String> listColumns = getColumns(columns);
-
+      final Map<String, String> lowerCaseFileSchemaColumns = new HashMap<String, String>();
+      for (ColumnDescriptor c : fileSchema.getColumns()) {
+        lowerCaseFileSchemaColumns.put(c.getPath()[0].toLowerCase(), c.getPath()[0]);
+      }
       final List<Type> typeListTable = new ArrayList<Type>();
-      for (final String col : listColumns) {
+      for (String col : listColumns) {
+        col = col.toLowerCase();
         // listColumns contains partition columns which are metadata only
-        if (fileSchema.containsField(col)) {
-          typeListTable.add(fileSchema.getType(col));
+        if (lowerCaseFileSchemaColumns.containsKey(col)) {
+          typeListTable.add(fileSchema.getType(lowerCaseFileSchemaColumns.get(col)));
         } else {
           // below allows schema evolution
           typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
@@ -93,10 +98,21 @@
       final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
 
       final List<Type> typeListWanted = new ArrayList<Type>();
+      final boolean indexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
       for (final Integer idx : indexColumnsWanted) {
-        typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
+        String col = listColumns.get(idx);
+        if (indexAccess) {
+          typeListWanted.add(tableSchema.getType(col));
+        } else if (lowerCaseFileSchemaColumns.containsKey(col = col.toLowerCase())) {
+          typeListWanted.add(tableSchema.getType(lowerCaseFileSchemaColumns.get(col)));
+        } else {
+          // should never occur?
+          String msg = "Column " + col + " at index " + idx + " does not exist in "
+            + lowerCaseFileSchemaColumns;
+          throw new IllegalStateException(msg);
+        }
       }
-      requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
+      requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
         typeListWanted), fileSchema, configuration);
 
       return new ReadContext(requestedSchemaByUser, contextMetadata);
@@ -127,29 +143,24 @@
     }
     final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser.
        parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration);
-
     return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema);
   }
-
+
   /**
-   * Determine the file column names based on the position within the requested columns and
+   * Determine the file column names based on the position within the requested columns and
    * use that as the requested schema.
    */
-  private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType fileSchema,
+  private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType fileSchema,
       Configuration configuration) {
-    if(configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) {
+    if (configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) {
       final List<String> listColumns = getColumns(configuration.get(IOConstants.COLUMNS));
-
       List<Type> requestedTypes = new ArrayList<Type>();
-
       for(Type t : requestedSchema.getFields()) {
         int index = listColumns.indexOf(t.getName());
         requestedTypes.add(fileSchema.getType(index));
       }
-
       requestedSchema = new MessageType(requestedSchema.getName(), requestedTypes);
     }
-
     return requestedSchema;
   }
 }
diff --git a/ql/src/test/queries/clientpositive/parquet_columnar.q b/ql/src/test/queries/clientpositive/parquet_columnar.q
index 4303d3e..b7886a8 100644
--- a/ql/src/test/queries/clientpositive/parquet_columnar.q
+++ b/ql/src/test/queries/clientpositive/parquet_columnar.q
@@ -13,15 +13,16 @@ CREATE TABLE parquet_columnar_access_stage (
 
 CREATE TABLE parquet_columnar_access (
     s string,
-    i int,
+    x int,
+    y int,
     f float
   ) STORED AS PARQUET;
 
 LOAD DATA LOCAL INPATH '../../data/files/parquet_columnar.txt' OVERWRITE INTO TABLE parquet_columnar_access_stage;
-INSERT OVERWRITE TABLE parquet_columnar_access SELECT * FROM parquet_columnar_access_stage;
+INSERT OVERWRITE TABLE parquet_columnar_access SELECT s, i, (i + 1), f FROM parquet_columnar_access_stage;
 SELECT * FROM parquet_columnar_access;
 
-ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, i1 int, f1 float);
+ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, x1 int, y1 int, f1 float);
 SELECT * FROM parquet_columnar_access;
diff --git a/ql/src/test/queries/clientpositive/parquet_mixed_case.q b/ql/src/test/queries/clientpositive/parquet_mixed_case.q
new file mode 100644
index 0000000..95fc958
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_mixed_case.q
@@ -0,0 +1,13 @@
+DROP TABLE parquet_mixed_case;
+
+CREATE TABLE parquet_mixed_case (
+  lowerCase string,
+  UPPERcase string,
+  stats bigint,
+  moreuppercase string,
+  MORELOWERCASE string
+) STORED AS PARQUET;
+
+LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case;
+
+SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case;
diff --git a/ql/src/test/results/clientpositive/parquet_columnar.q.out b/ql/src/test/results/clientpositive/parquet_columnar.q.out
index 2222ff9..a4ed372 100644
--- a/ql/src/test/results/clientpositive/parquet_columnar.q.out
+++ b/ql/src/test/results/clientpositive/parquet_columnar.q.out
@@ -29,14 +29,16 @@ POSTHOOK: Output: database:default
 POSTHOOK: Output: default@parquet_columnar_access_stage
 PREHOOK: query: CREATE TABLE parquet_columnar_access (
     s string,
-    i int,
+    x int,
+    y int,
     f float
   ) STORED AS PARQUET
 PREHOOK: type: CREATETABLE
 PREHOOK: Output: database:default
 POSTHOOK: query: CREATE TABLE parquet_columnar_access (
     s string,
-    i int,
+    x int,
+    y int,
     f float
   ) STORED AS PARQUET
 POSTHOOK: type: CREATETABLE
@@ -50,17 +52,18 @@ POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_columnar.txt'
 POSTHOOK: type: LOAD
 #### A masked pattern was here ####
 POSTHOOK: Output: default@parquet_columnar_access_stage
-PREHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT * FROM parquet_columnar_access_stage
+PREHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT s, i, (i + 1), f FROM parquet_columnar_access_stage
 PREHOOK: type: QUERY
 PREHOOK: Input: default@parquet_columnar_access_stage
 PREHOOK: Output: default@parquet_columnar_access
-POSTHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT * FROM parquet_columnar_access_stage
+POSTHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT s, i, (i + 1), f FROM parquet_columnar_access_stage
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@parquet_columnar_access_stage
 POSTHOOK: Output: default@parquet_columnar_access
 POSTHOOK: Lineage: parquet_columnar_access.f SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:f, type:float, comment:null), ]
-POSTHOOK: Lineage: parquet_columnar_access.i SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:i, type:int, comment:null), ]
 POSTHOOK: Lineage: parquet_columnar_access.s SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:s, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_columnar_access.x SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:i, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_columnar_access.y EXPRESSION [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:i, type:int, comment:null), ]
 PREHOOK: query: SELECT * FROM parquet_columnar_access
 PREHOOK: type: QUERY
 PREHOOK: Input: default@parquet_columnar_access
@@ -69,32 +72,32 @@ POSTHOOK: query: SELECT * FROM parquet_columnar_access
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@parquet_columnar_access
 #### A masked pattern was here ####
-1abc00 1 1.0
-1def01 2 1.1
-1ghi02 3 1.2
-1jkl03 1 1.3
-1mno04 2 1.4
-1pqr05 3 1.0
-1stu06 1 1.1
-1vwx07 2 1.2
-1yza08 3 1.3
-1bcd09 1 1.4
-1efg10 2 1.0
-1hij11 3 1.1
-1klm12 1 1.2
-1nop13 2 1.3
-1qrs14 3 1.4
-1tuv15 1 1.0
-1wxy16 2 1.1
-1zab17 3 1.2
-1cde18 1 1.3
-1fgh19 2 1.4
-1ijk20 3 1.0
-PREHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, i1 int, f1 float)
+1abc00 1 2 1.0
+1def01 2 3 1.1
+1ghi02 3 4 1.2
+1jkl03 1 2 1.3
+1mno04 2 3 1.4
+1pqr05 3 4 1.0
+1stu06 1 2 1.1
+1vwx07 2 3 1.2
+1yza08 3 4 1.3
+1bcd09 1 2 1.4
+1efg10 2 3 1.0
+1hij11 3 4 1.1
+1klm12 1 2 1.2
+1nop13 2 3 1.3
+1qrs14 3 4 1.4
+1tuv15 1 2 1.0
+1wxy16 2 3 1.1
+1zab17 3 4 1.2
+1cde18 1 2 1.3
+1fgh19 2 3 1.4
+1ijk20 3 4 1.0
+PREHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, x1 int, y1 int, f1 float)
 PREHOOK: type: ALTERTABLE_REPLACECOLS
 PREHOOK: Input: default@parquet_columnar_access
 PREHOOK: Output: default@parquet_columnar_access
-POSTHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, i1 int, f1 float)
+POSTHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, x1 int, y1 int, f1 float)
 POSTHOOK: type: ALTERTABLE_REPLACECOLS
 POSTHOOK: Input: default@parquet_columnar_access
 POSTHOOK: Output: default@parquet_columnar_access
@@ -106,24 +109,24 @@ POSTHOOK: query: SELECT * FROM parquet_columnar_access
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@parquet_columnar_access
 #### A masked pattern was here ####
-1abc00 1 1.0
-1def01 2 1.1
-1ghi02 3 1.2
-1jkl03 1 1.3
-1mno04 2 1.4
-1pqr05 3 1.0
-1stu06 1 1.1
-1vwx07 2 1.2
-1yza08 3 1.3
-1bcd09 1 1.4
-1efg10 2 1.0
-1hij11 3 1.1
-1klm12 1 1.2
-1nop13 2 1.3
-1qrs14 3 1.4
-1tuv15 1 1.0
-1wxy16 2 1.1
-1zab17 3 1.2
-1cde18 1 1.3
-1fgh19 2 1.4
-1ijk20 3 1.0
+1abc00 1 2 1.0
+1def01 2 3 1.1
+1ghi02 3 4 1.2
+1jkl03 1 2 1.3
+1mno04 2 3 1.4
+1pqr05 3 4 1.0
+1stu06 1 2 1.1
+1vwx07 2 3 1.2
+1yza08 3 4 1.3
+1bcd09 1 2 1.4
+1efg10 2 3 1.0
+1hij11 3 4 1.1
+1klm12 1 2 1.2
+1nop13 2 3 1.3
+1qrs14 3 4 1.4
+1tuv15 1 2 1.0
+1wxy16 2 3 1.1
+1zab17 3 4 1.2
+1cde18 1 2 1.3
+1fgh19 2 3 1.4
+1ijk20 3 4 1.0
diff --git a/ql/src/test/results/clientpositive/parquet_mixed_case.q.out b/ql/src/test/results/clientpositive/parquet_mixed_case.q.out
new file mode 100644
index 0000000..cd9d560
--- /dev/null
+++ b/ql/src/test/results/clientpositive/parquet_mixed_case.q.out
@@ -0,0 +1,41 @@
+PREHOOK: query: DROP TABLE parquet_mixed_case
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE parquet_mixed_case
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_mixed_case (
+  lowerCase string,
+  UPPERcase string,
+  stats bigint,
+  moreuppercase string,
+  MORELOWERCASE string
+) STORED AS PARQUET
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: CREATE TABLE parquet_mixed_case (
+  lowerCase string,
+  UPPERcase string,
+  stats bigint,
+  moreuppercase string,
+  MORELOWERCASE string
+) STORED AS PARQUET
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_mixed_case
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@parquet_mixed_case
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@parquet_mixed_case
+PREHOOK: query: SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_mixed_case
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_mixed_case
+#### A masked pattern was here ####
+test lowercase string | test upperCase string | NULL | more upperCase string | more lowercase string
+test lowercase string2 | test upperCase string2 | NULL | more upperCase string2 | more lowercase string2