Index: data/files/nulls.txt =================================================================== --- data/files/nulls.txt (revision 0) +++ data/files/nulls.txt (working copy) Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (revision 1466183) +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (working copy) @@ -18,8 +18,15 @@ package org.apache.hadoop.hive.ql.io.orc; -import com.google.protobuf.ByteString; -import com.google.protobuf.CodedOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -42,14 +49,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; import org.apache.hadoop.io.BytesWritable; -import java.io.IOException; -import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.sql.Timestamp; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; +import com.google.protobuf.ByteString; +import com.google.protobuf.CodedOutputStream; /** * An ORC file writer. The file is divided into stripes, which is the natural @@ -732,19 +733,8 @@ int length = rows.size(); int rowIndexEntry = 0; OrcProto.RowIndex.Builder rowIndex = getRowIndex(); - // need to build the first index entry out here, to handle the case of - // not having any values. 
- if (buildIndex) { - while (0 == rowIndexValueCount.get(rowIndexEntry) && - rowIndexEntry < savedRowIndex.size()) { - OrcProto.RowIndexEntry.Builder base = - savedRowIndex.get(rowIndexEntry++).toBuilder(); - rowOutput.getPosition(new RowIndexPositionRecorder(base)); - rowIndex.addEntry(base.build()); - } - } // write the values translated into the dump order. - for(int i = 0; i < length; ++i) { + for(int i = 0; i <= length; ++i) { // now that we are writing out the row values, we can finalize the // row index if (buildIndex) { @@ -756,7 +746,9 @@ rowIndex.addEntry(base.build()); } } - rowOutput.write(dumpOrder[rows.get(i)]); + if (i != length) { + rowOutput.write(dumpOrder[rows.get(i)]); + } } // we need to build the rowindex before calling super, since it // writes it out. Index: ql/src/test/queries/clientpositive/orc_ends_with_nulls.q =================================================================== --- ql/src/test/queries/clientpositive/orc_ends_with_nulls.q (revision 0) +++ ql/src/test/queries/clientpositive/orc_ends_with_nulls.q (working copy) @@ -0,0 +1,17 @@ +CREATE TABLE test_orc (key STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'; + +ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.row.index.stride' = '1000'); + +-- nulls.txt is a file containing a non-null string row followed by 1000 null string rows +-- this produces the effect that the number of non-null rows between the last and second +-- to last index stride are the same (there's only two index strides) + +CREATE TABLE src_null(a STRING) STORED AS TEXTFILE; +LOAD DATA LOCAL INPATH '../data/files/nulls.txt' INTO TABLE src_null; + +INSERT OVERWRITE TABLE test_orc SELECT a FROM src_null; + +SELECT * FROM test_orc LIMIT 5; Index: ql/src/test/results/clientpositive/orc_ends_with_nulls.q.out 
=================================================================== --- ql/src/test/results/clientpositive/orc_ends_with_nulls.q.out (revision 0) +++ ql/src/test/results/clientpositive/orc_ends_with_nulls.q.out (working copy) @@ -0,0 +1,61 @@ +PREHOOK: query: CREATE TABLE test_orc (key STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_orc (key STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_orc +PREHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.row.index.stride' = '1000') +PREHOOK: type: ALTERTABLE_SERDEPROPERTIES +PREHOOK: Input: default@test_orc +PREHOOK: Output: default@test_orc +POSTHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.row.index.stride' = '1000') +POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES +POSTHOOK: Input: default@test_orc +POSTHOOK: Output: default@test_orc +PREHOOK: query: -- nulls.txt is a file containing a non-null string row followed by 1000 null string rows +-- this produces the effect that the number of non-null rows between the last and second +-- to last index stride are the same (there's only two index strides) + +CREATE TABLE src_null(a STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- nulls.txt is a file containing a non-null string row followed by 1000 null string rows +-- this produces the effect that the number of non-null rows between the last and second +-- to last index stride are the same (there's only two index strides) + +CREATE TABLE src_null(a STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@src_null 
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/nulls.txt' INTO TABLE src_null +PREHOOK: type: LOAD +PREHOOK: Output: default@src_null +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/nulls.txt' INTO TABLE src_null +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_null +PREHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT a FROM src_null +PREHOOK: type: QUERY +PREHOOK: Input: default@src_null +PREHOOK: Output: default@test_orc +POSTHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT a FROM src_null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_null +POSTHOOK: Output: default@test_orc +POSTHOOK: Lineage: test_orc.key SIMPLE [(src_null)src_null.FieldSchema(name:a, type:string, comment:null), ] +PREHOOK: query: SELECT * FROM test_orc LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_orc +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM test_orc LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_orc +#### A masked pattern was here #### +POSTHOOK: Lineage: test_orc.key SIMPLE [(src_null)src_null.FieldSchema(name:a, type:string, comment:null), ] +1 +NULL +NULL +NULL +NULL