diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index eae281c..f94b5e9 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -87,6 +87,7 @@
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
 import org.apache.hadoop.hive.ql.metadata.Hive;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
 import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.serde2.SerDeStats;
@@ -2114,6 +2115,23 @@ public static TypeDescription getDesiredRowTypeDescr(Configuration conf, boolean
     if (schemaEvolutionTypeDescrs.size() != schemaEvolutionColumnNames.size()) {
       return null;
     }
+
+    // Find the first virtual column and clip it and all following columns off.
+    int virtualColumnClipNum = -1;
+    int columnNum = 0;
+    for (String columnName : schemaEvolutionColumnNames) {
+      if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(columnName)) {
+        virtualColumnClipNum = columnNum;
+        break;
+      }
+      columnNum++;
+    }
+    if (virtualColumnClipNum != -1) {
+      schemaEvolutionColumnNames =
+          Lists.newArrayList(schemaEvolutionColumnNames.subList(0, virtualColumnClipNum));
+      schemaEvolutionTypeDescrs = Lists.newArrayList(schemaEvolutionTypeDescrs.subList(0, virtualColumnClipNum));
+    }
+
     LOG.info("Using column configuration variables columns " +
         schemaEvolutionColumnNames.toString() +
         " / columns.types " +
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
index a85bfef..a127ccc 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
@@ -154,6 +154,7 @@ protected RecordReaderImpl(List<StripeInformation> stripes,
 
     TreeReaderFactory.TreeReaderSchema treeReaderSchema;
     if (options.getSchema() == null) {
+      LOG.info("Schema on read not provided -- using file schema " + types.toString());
       treeReaderSchema = new TreeReaderFactory.TreeReaderSchema().fileTypes(types).schemaTypes(types);
     } else {
 
@@ -901,7 +902,7 @@ private void readAllDataStreams(StripeInformation stripe) throws IOException {
       // since stream kind is optional, first check if it exists
       if (stream.hasKind() &&
           (StreamName.getArea(streamKind) == StreamName.Area.DATA) &&
-          includedColumns[column]) {
+          (column < includedColumns.length && includedColumns[column])) {
         // if we aren't filtering or it is a dictionary, load it.
         if (includedRowGroups == null ||
             RecordReaderUtils.isDictionary(streamKind, encodings.get(column))) {
@@ -926,7 +927,8 @@ void createStreams(List<OrcProto.Stream> streamDescriptions,
     long streamOffset = 0;
     for (OrcProto.Stream streamDesc : streamDescriptions) {
       int column = streamDesc.getColumn();
-      if ((includeColumn != null && !includeColumn[column]) ||
+      if ((includeColumn != null &&
+          (column >= includeColumn.length || !includeColumn[column])) ||
           streamDesc.hasKind() &&
           (StreamName.getArea(streamDesc.getKind()) != StreamName.Area.DATA)) {
         streamOffset += streamDesc.getLength();
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java
index 3fe28d8..a241d68 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java
@@ -2036,7 +2036,7 @@ public Object nextVector(Object previousVector, long batchSize) throws IOExcepti
   }
 
   protected static class StructTreeReader extends TreeReader {
-    private final int fileColumnCount;
+    private final int readColumnCount;
     private final int resultColumnCount;
     protected final TreeReader[] fields;
     private final String[] fieldNames;
@@ -2049,30 +2049,31 @@ protected StructTreeReader(
       super(columnId);
 
       OrcProto.Type fileStructType = treeReaderSchema.getFileTypes().get(columnId);
-      fileColumnCount = fileStructType.getFieldNamesCount();
 
       OrcProto.Type schemaStructType = treeReaderSchema.getSchemaTypes().get(columnId);
 
+      readColumnCount = Math.min(fileStructType.getFieldNamesCount(), schemaStructType.getFieldNamesCount());
+
       if (columnId == treeReaderSchema.getInnerStructSubtype()) {
         // If there are more result columns than reader columns, we will default those additional
         // columns to NULL.
         resultColumnCount = schemaStructType.getFieldNamesCount();
       } else {
-        resultColumnCount = fileColumnCount;
+        resultColumnCount = readColumnCount;
       }
 
-      this.fields = new TreeReader[fileColumnCount];
-      this.fieldNames = new String[fileColumnCount];
+      this.fields = new TreeReader[readColumnCount];
+      this.fieldNames = new String[readColumnCount];
 
       if (included == null) {
-        for (int i = 0; i < fileColumnCount; ++i) {
+        for (int i = 0; i < readColumnCount; ++i) {
           int subtype = schemaStructType.getSubtypes(i);
           this.fields[i] = createTreeReader(subtype, treeReaderSchema, included, skipCorrupt);
           // Use the treeReaderSchema evolution name since file/reader types may not have the real column name.
           this.fieldNames[i] = schemaStructType.getFieldNames(i);
         }
       } else {
-        for (int i = 0; i < fileColumnCount; ++i) {
+        for (int i = 0; i < readColumnCount; ++i) {
           int subtype = schemaStructType.getSubtypes(i);
           if (subtype >= included.length) {
             throw new IOException("subtype " + subtype + " exceeds the included array size " +
@@ -2116,13 +2117,13 @@ Object next(Object previous) throws IOException {
           result.setNumFields(resultColumnCount);
         }
       }
-      for (int i = 0; i < fileColumnCount; ++i) {
+      for (int i = 0; i < readColumnCount; ++i) {
         if (fields[i] != null) {
           result.setFieldValue(i, fields[i].next(result.getFieldValue(i)));
         }
       }
-      if (resultColumnCount > fileColumnCount) {
-        for (int i = fileColumnCount; i < resultColumnCount; ++i) {
+      if (resultColumnCount > readColumnCount) {
+        for (int i = readColumnCount; i < resultColumnCount; ++i) {
           // Default new treeReaderSchema evolution fields to NULL.
           result.setFieldValue(i, null);
         }
@@ -2135,13 +2136,13 @@ Object next(Object previous) throws IOException {
     public Object nextVector(Object previousVector, long batchSize) throws IOException {
       final ColumnVector[] result;
       if (previousVector == null) {
-        result = new ColumnVector[fileColumnCount];
+        result = new ColumnVector[readColumnCount];
       } else {
         result = (ColumnVector[]) previousVector;
       }
 
       // Read all the members of struct as column vectors
-      for (int i = 0; i < fileColumnCount; i++) {
+      for (int i = 0; i < readColumnCount; i++) {
         if (fields[i] != null) {
           if (result[i] == null) {
             result[i] = (ColumnVector) fields[i].nextVector(null, batchSize);
@@ -2152,8 +2153,8 @@ public Object nextVector(Object previousVector, long batchSize) throws IOExcepti
       }
 
       // Default additional treeReaderSchema evolution fields to NULL.
-      if (vectorColumnCount != -1 && vectorColumnCount > fileColumnCount) {
-        for (int i = fileColumnCount; i < vectorColumnCount; ++i) {
+      if (vectorColumnCount != -1 && vectorColumnCount > readColumnCount) {
+        for (int i = readColumnCount; i < vectorColumnCount; ++i) {
           ColumnVector colVector = result[i];
           if (colVector != null) {
             colVector.isRepeating = true;
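--
Reviewer note (not part of the patch): the OrcInputFormat hunk clips the schema-evolution
column name/type lists at the first virtual column, since virtual columns such as ROW__ID or
INPUT__FILE__NAME have no physical streams in the ORC file. Below is a minimal standalone
sketch of that clipping logic for illustration only; the column names are made up, and the
real name set lives in org.apache.hadoop.hive.ql.metadata.VirtualColumn.VIRTUAL_COLUMN_NAMES.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class VirtualColumnClipSketch {
  // Example virtual column names only; Hive's actual list is
  // VirtualColumn.VIRTUAL_COLUMN_NAMES.
  private static final Set<String> VIRTUAL_COLUMN_NAMES = new HashSet<>(
      Arrays.asList("ROW__ID", "INPUT__FILE__NAME", "BLOCK__OFFSET__INSIDE__FILE"));

  public static void main(String[] args) {
    List<String> columnNames = new ArrayList<>(Arrays.asList("id", "name", "ROW__ID"));

    // Find the index of the first virtual column, as in the patch above.
    int virtualColumnClipNum = -1;
    int columnNum = 0;
    for (String columnName : columnNames) {
      if (VIRTUAL_COLUMN_NAMES.contains(columnName)) {
        virtualColumnClipNum = columnNum;
        break;
      }
      columnNum++;
    }

    // Clip the virtual column and everything after it off.
    if (virtualColumnClipNum != -1) {
      columnNames = new ArrayList<>(columnNames.subList(0, virtualColumnClipNum));
    }
    System.out.println(columnNames); // prints [id, name]
  }
}

Breaking at the first match, as the patch does, assumes virtual columns are always appended
after all physical columns in these configuration lists, so everything from that index on
can safely be dropped.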