Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java	(revision 1455674)
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java	(working copy)
@@ -17,6 +17,15 @@
  */
 package org.apache.hadoop.hive.ql.io.orc;
 
+import java.io.EOFException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -30,15 +39,6 @@
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 
-import java.io.EOFException;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
 class RecordReaderImpl implements RecordReader {
   private final FSDataInputStream file;
   private final long firstRow;
@@ -735,6 +735,12 @@
       result = new OrcStruct(fields.length);
     } else {
       result = (OrcStruct) previous;
+
+      // If the input format was initialized with a file that had a different
+      // number of fields, resize the reused object to this file's field count
+      if (result.getNumFields() != fields.length) {
+        result.setNumFields(fields.length);
+      }
     }
     for(int i=0; i < fields.length; ++i) {
       if (fields[i] != null) {
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java	(revision 1455674)
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java	(working copy)
@@ -17,6 +17,13 @@
  */
 package org.apache.hadoop.hive.ql.io.orc;
 
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -31,16 +38,9 @@
 import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
 import org.apache.hadoop.io.Writable;
 
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
 final class OrcStruct implements Writable {
 
-  private final Object[] fields;
+  private Object[] fields;
 
   OrcStruct(int children) {
     fields = new Object[children];
@@ -54,6 +54,14 @@
     fields[fieldIndex] = value;
   }
 
+  public int getNumFields() {
+    return fields.length;
+  }
+
+  public void setNumFields(int numFields) {
+    fields = new Object[numFields];
+  }
+
   @Override
   public void write(DataOutput dataOutput) throws IOException {
     throw new UnsupportedOperationException("write unsupported");
Index: ql/src/test/queries/clientpositive/orc_diff_part_cols.q
===================================================================
--- ql/src/test/queries/clientpositive/orc_diff_part_cols.q	(revision 0)
+++ ql/src/test/queries/clientpositive/orc_diff_part_cols.q	(working copy)
@@ -0,0 +1,19 @@
+CREATE TABLE test_orc (key STRING)
+PARTITIONED BY (part STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+
+-- Create a table with one column, write to a partition, then add an additional column and write
+-- to another partition
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc PARTITION (part = '1') SELECT key FROM src LIMIT 5;
+
+ALTER TABLE test_orc ADD COLUMNS (cnt INT);
+
+INSERT OVERWRITE TABLE test_orc PARTITION (part = '2') SELECT key, count(*) FROM src GROUP BY key LIMIT 5;
+
+SELECT * FROM test_orc;
Index: ql/src/test/queries/clientpositive/orc_empty_files.q
===================================================================
--- ql/src/test/queries/clientpositive/orc_empty_files.q	(revision 0)
+++ ql/src/test/queries/clientpositive/orc_empty_files.q	(working copy)
@@ -0,0 +1,18 @@
+CREATE TABLE test_orc (key STRING, cnt INT)
+CLUSTERED BY (key) INTO 3 BUCKETS
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
+
+set hive.enforce.bucketing=true;
+set hive.exec.reducers.max = 1;
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+
+-- Creates a table bucketed into 3 buckets, but only one contains data, specifically bucket 1,
+-- buckets 0 and 2 are empty, so this tests reading from an empty file followed by a file
+-- containing data, and a file containing data followed by an empty file.
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc SELECT one, COUNT(*) FROM (SELECT 1 AS one FROM src) a GROUP BY one;
+
+SELECT count(*) FROM test_orc;
Index: ql/src/test/results/clientpositive/orc_diff_part_cols.q.out
===================================================================
--- ql/src/test/results/clientpositive/orc_diff_part_cols.q.out	(revision 0)
+++ ql/src/test/results/clientpositive/orc_diff_part_cols.q.out	(working copy)
@@ -0,0 +1,73 @@
+PREHOOK: query: CREATE TABLE test_orc (key STRING)
+PARTITIONED BY (part STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE test_orc (key STRING)
+PARTITIONED BY (part STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@test_orc
+PREHOOK: query: -- Create a table with one column, write to a partition, then add an additional column and write
+-- to another partition
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc PARTITION (part = '1') SELECT key FROM src LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@test_orc@part=1
+POSTHOOK: query: -- Create a table with one column, write to a partition, then add an additional column and write
+-- to another partition
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc PARTITION (part = '1') SELECT key FROM src LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@test_orc@part=1
+POSTHOOK: Lineage: test_orc PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: ALTER TABLE test_orc ADD COLUMNS (cnt INT)
+PREHOOK: type: ALTERTABLE_ADDCOLS
+PREHOOK: Input: default@test_orc
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: ALTER TABLE test_orc ADD COLUMNS (cnt INT)
+POSTHOOK: type: ALTERTABLE_ADDCOLS
+POSTHOOK: Input: default@test_orc
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: INSERT OVERWRITE TABLE test_orc PARTITION (part = '2') SELECT key, count(*) FROM src GROUP BY key LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@test_orc@part=2
+POSTHOOK: query: INSERT OVERWRITE TABLE test_orc PARTITION (part = '2') SELECT key, count(*) FROM src GROUP BY key LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@test_orc@part=2
+POSTHOOK: Lineage: test_orc PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_orc PARTITION(part=2).cnt EXPRESSION [(src)src.null, ]
+POSTHOOK: Lineage: test_orc PARTITION(part=2).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: SELECT * FROM test_orc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc@part=1
+PREHOOK: Input: default@test_orc@part=2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test_orc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc@part=1
+POSTHOOK: Input: default@test_orc@part=2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: test_orc PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_orc PARTITION(part=2).cnt EXPRESSION [(src)src.null, ]
+POSTHOOK: Lineage: test_orc PARTITION(part=2).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+238	NULL	1
+86	NULL	1
+311	NULL	1
+27	NULL	1
+165	NULL	1
+0	3	2
+10	1	2
+100	2	2
+103	2	2
+104	2	2
Index: ql/src/test/results/clientpositive/orc_empty_files.q.out
===================================================================
--- ql/src/test/results/clientpositive/orc_empty_files.q.out	(revision 0)
+++ ql/src/test/results/clientpositive/orc_empty_files.q.out	(working copy)
@@ -0,0 +1,44 @@
+PREHOOK: query: CREATE TABLE test_orc (key STRING, cnt INT)
+CLUSTERED BY (key) INTO 3 BUCKETS
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE test_orc (key STRING, cnt INT)
+CLUSTERED BY (key) INTO 3 BUCKETS
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@test_orc
+PREHOOK: query: -- Creates a table bucketed into 3 buckets, but only one contains data, specifically bucket 1,
+-- buckets 0 and 2 are empty, so this tests reading from an empty file followed by a file
+-- containing data, and a file containing data followed by an empty file.
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc SELECT one, COUNT(*) FROM (SELECT 1 AS one FROM src) a GROUP BY one
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: -- Creates a table bucketed into 3 buckets, but only one contains data, specifically bucket 1,
+-- buckets 0 and 2 are empty, so this tests reading from an empty file followed by a file
+-- containing data, and a file containing data followed by an empty file.
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc SELECT one, COUNT(*) FROM (SELECT 1 AS one FROM src) a GROUP BY one
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.cnt EXPRESSION []
+POSTHOOK: Lineage: test_orc.key EXPRESSION []
+PREHOOK: query: SELECT count(*) FROM test_orc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(*) FROM test_orc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: Lineage: test_orc.cnt EXPRESSION []
+POSTHOOK: Lineage: test_orc.key EXPRESSION []
+1
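
Reviewer note (not part of the patch): a minimal sketch of the object-reuse scenario the RecordReaderImpl change guards against. The class name OrcStructResizeSketch and the helper variable names are hypothetical; the sketch assumes it is compiled inside org.apache.hadoop.hive.ql.io.orc so that the package-private OrcStruct and its setFieldValue(int, Object) setter are visible.

package org.apache.hadoop.hive.ql.io.orc;

public class OrcStructResizeSketch {
  public static void main(String[] args) {
    // With CombineHiveInputFormat, one combined split can span files with
    // different schemas (e.g. partitions written before and after
    // ALTER TABLE ... ADD COLUMNS), and the same value object is handed to
    // each file's reader in turn.
    OrcStruct previous = new OrcStruct(1);   // created for a one-field file
    previous.setFieldValue(0, null);

    // The next file in the split has two fields. Before this patch the
    // reader reused the one-field object as-is, which breaks once the field
    // counts differ; with the patch it resizes first, as RecordReaderImpl does:
    int fieldsInCurrentFile = 2;             // hypothetical second schema
    if (previous.getNumFields() != fieldsInCurrentFile) {
      previous.setNumFields(fieldsInCurrentFile); // swaps in a new Object[]
    }

    // setNumFields discards the old field values, which is safe here because
    // the reader overwrites every field it reads.
    System.out.println("fields: " + previous.getNumFields()); // prints 2
  }
}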