diff --git a/data/files/parquet_mixed_case b/data/files/parquet_mixed_case new file mode 100644 index 0000000..f6af0f5 Binary files /dev/null and b/data/files/parquet_mixed_case differ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java index c100dce..8286ab8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java @@ -26,6 +26,7 @@ import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.util.StringUtils; +import parquet.column.ColumnDescriptor; import parquet.hadoop.api.ReadSupport; import parquet.io.api.RecordMaterializer; import parquet.schema.MessageType; @@ -46,8 +47,8 @@ private static final String TABLE_SCHEMA = "table_schema"; public static final String HIVE_SCHEMA_KEY = "HIVE_TABLE_SCHEMA"; - public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access"; - + public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access"; + /** * From a string which columns names (including hive column), return a list * of string columns @@ -75,12 +76,16 @@ final Map contextMetadata = new HashMap(); if (columns != null) { final List listColumns = getColumns(columns); - + final Map lowerCaseFileSchemaColumns = new HashMap(); + for (ColumnDescriptor c : fileSchema.getColumns()) { + lowerCaseFileSchemaColumns.put(c.getPath()[0].toLowerCase(), c.getPath()[0]); + } final List typeListTable = new ArrayList(); - for (final String col : listColumns) { + for (String col : listColumns) { + col = col.toLowerCase(); // listColumns contains partition columns which are metadata only - if (fileSchema.containsField(col)) { - typeListTable.add(fileSchema.getType(col)); + if (lowerCaseFileSchemaColumns.containsKey(col)) { + 
typeListTable.add(fileSchema.getType(lowerCaseFileSchemaColumns.get(col))); } else { // below allows schema evolution typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col)); @@ -94,9 +99,17 @@ final List typeListWanted = new ArrayList(); for (final Integer idx : indexColumnsWanted) { - typeListWanted.add(tableSchema.getType(listColumns.get(idx))); + String col = listColumns.get(idx).toLowerCase(); + if (lowerCaseFileSchemaColumns.containsKey(col)) { + typeListWanted.add(tableSchema.getType(lowerCaseFileSchemaColumns.get(col))); + } else { + // Defensive check: every requested column should exist in the file schema + // by this point, so reaching here indicates an internal inconsistency. + String msg = "Column " + col + " at index " + idx + " does not exist in " + + lowerCaseFileSchemaColumns; + throw new IllegalStateException(msg); + } } - requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(), + requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(), typeListWanted), fileSchema, configuration); return new ReadContext(requestedSchemaByUser, contextMetadata); @@ -127,10 +140,9 @@ } final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser. parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration); - return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema); } - + /** * Determine the file column names based on the position within the requested columns and * use that as the requested schema. 
@@ -139,17 +151,13 @@ private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType Configuration configuration) { if(configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) { final List listColumns = getColumns(configuration.get(IOConstants.COLUMNS)); - List requestedTypes = new ArrayList(); - for(Type t : requestedSchema.getFields()) { int index = listColumns.indexOf(t.getName()); requestedTypes.add(fileSchema.getType(index)); } - requestedSchema = new MessageType(requestedSchema.getName(), requestedTypes); } - return requestedSchema; } } diff --git a/ql/src/test/queries/clientpositive/parquet_mixed_case.q b/ql/src/test/queries/clientpositive/parquet_mixed_case.q new file mode 100644 index 0000000..95fc958 --- /dev/null +++ b/ql/src/test/queries/clientpositive/parquet_mixed_case.q @@ -0,0 +1,13 @@ +DROP TABLE parquet_mixed_case; + +CREATE TABLE parquet_mixed_case ( + lowerCase string, + UPPERcase string, + stats bigint, + moreuppercase string, + MORELOWERCASE string +) STORED AS PARQUET; + +LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case; + +SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case; diff --git a/ql/src/test/results/clientpositive/parquet_mixed_case.q.out b/ql/src/test/results/clientpositive/parquet_mixed_case.q.out new file mode 100644 index 0000000..cd9d560 --- /dev/null +++ b/ql/src/test/results/clientpositive/parquet_mixed_case.q.out @@ -0,0 +1,41 @@ +PREHOOK: query: DROP TABLE parquet_mixed_case +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE parquet_mixed_case +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_mixed_case ( + lowerCase string, + UPPERcase string, + stats bigint, + moreuppercase string, + MORELOWERCASE string +) STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: CREATE TABLE parquet_mixed_case ( + lowerCase string, + UPPERcase 
string, + stats bigint, + moreuppercase string, + MORELOWERCASE string +) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_mixed_case +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@parquet_mixed_case +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@parquet_mixed_case +PREHOOK: query: SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_mixed_case +#### A masked pattern was here #### +POSTHOOK: query: SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_mixed_case +#### A masked pattern was here #### +test lowercase string | test upperCase string | NULL | more upperCase string | more lowercase string +test lowercase string2 | test upperCase string2 | NULL | more upperCase string2 | more lowercase string2