commit 9a56169f129c903e62d948d0e10c9bc44a4c8062 Author: Owen O'Malley Date: Tue Nov 4 14:49:32 2014 -0800 HIVE-8732. Fix column statistics. diff --git ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java index 2ba2838..662e058 100644 --- ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java +++ ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java @@ -16333,14 +16333,32 @@ public Builder clearRowIndexStride() { // repeated uint32 version = 4 [packed = true]; /** * repeated uint32 version = 4 [packed = true]; + * + *
+     * the version of the file format
+     *   [0, 11] = Hive 0.11
+     *   [0, 12] = Hive 0.12
+     * 
*/ java.util.List getVersionList(); /** * repeated uint32 version = 4 [packed = true]; + * + *
+     * the version of the file format
+     *   [0, 11] = Hive 0.11
+     *   [0, 12] = Hive 0.12
+     * 
*/ int getVersionCount(); /** * repeated uint32 version = 4 [packed = true]; + * + *
+     * the version of the file format
+     *   [0, 11] = Hive 0.11
+     *   [0, 12] = Hive 0.12
+     * 
*/ int getVersion(int index); @@ -16354,6 +16372,28 @@ public Builder clearRowIndexStride() { */ long getMetadataLength(); + // optional uint32 writerVersion = 6; + /** + * optional uint32 writerVersion = 6; + * + *
+     * Version of the writer:
+     *   0 (or missing) = original
+     *   1 = HIVE-8732 fixed
+     * 
+ */ + boolean hasWriterVersion(); + /** + * optional uint32 writerVersion = 6; + * + *
+     * Version of the writer:
+     *   0 (or missing) = original
+     *   1 = HIVE-8732 fixed
+     * 
+ */ + int getWriterVersion(); + // optional string magic = 8000; /** * optional string magic = 8000; @@ -16483,8 +16523,13 @@ private PostScript( metadataLength_ = input.readUInt64(); break; } - case 64002: { + case 48: { bitField0_ |= 0x00000010; + writerVersion_ = input.readUInt32(); + break; + } + case 64002: { + bitField0_ |= 0x00000020; magic_ = input.readBytes(); break; } @@ -16584,6 +16629,12 @@ public long getCompressionBlockSize() { private java.util.List version_; /** * repeated uint32 version = 4 [packed = true]; + * + *
+     * the version of the file format
+     *   [0, 11] = Hive 0.11
+     *   [0, 12] = Hive 0.12
+     * 
*/ public java.util.List getVersionList() { @@ -16591,12 +16642,24 @@ public long getCompressionBlockSize() { } /** * repeated uint32 version = 4 [packed = true]; + * + *
+     * the version of the file format
+     *   [0, 11] = Hive 0.11
+     *   [0, 12] = Hive 0.12
+     * 
*/ public int getVersionCount() { return version_.size(); } /** * repeated uint32 version = 4 [packed = true]; + * + *
+     * the version of the file format
+     *   [0, 11] = Hive 0.11
+     *   [0, 12] = Hive 0.12
+     * 
*/ public int getVersion(int index) { return version_.get(index); @@ -16619,6 +16682,34 @@ public long getMetadataLength() { return metadataLength_; } + // optional uint32 writerVersion = 6; + public static final int WRITERVERSION_FIELD_NUMBER = 6; + private int writerVersion_; + /** + * optional uint32 writerVersion = 6; + * + *
+     * Version of the writer:
+     *   0 (or missing) = original
+     *   1 = HIVE-8732 fixed
+     * 
+ */ + public boolean hasWriterVersion() { + return ((bitField0_ & 0x00000010) == 0x00000010); + } + /** + * optional uint32 writerVersion = 6; + * + *
+     * Version of the writer:
+     *   0 (or missing) = original
+     *   1 = HIVE-8732 fixed
+     * 
+ */ + public int getWriterVersion() { + return writerVersion_; + } + // optional string magic = 8000; public static final int MAGIC_FIELD_NUMBER = 8000; private java.lang.Object magic_; @@ -16630,7 +16721,7 @@ public long getMetadataLength() { * */ public boolean hasMagic() { - return ((bitField0_ & 0x00000010) == 0x00000010); + return ((bitField0_ & 0x00000020) == 0x00000020); } /** * optional string magic = 8000; @@ -16680,6 +16771,7 @@ private void initFields() { compressionBlockSize_ = 0L; version_ = java.util.Collections.emptyList(); metadataLength_ = 0L; + writerVersion_ = 0; magic_ = ""; } private byte memoizedIsInitialized = -1; @@ -16714,6 +16806,9 @@ public void writeTo(com.google.protobuf.CodedOutputStream output) output.writeUInt64(5, metadataLength_); } if (((bitField0_ & 0x00000010) == 0x00000010)) { + output.writeUInt32(6, writerVersion_); + } + if (((bitField0_ & 0x00000020) == 0x00000020)) { output.writeBytes(8000, getMagicBytes()); } getUnknownFields().writeTo(output); @@ -16757,6 +16852,10 @@ public int getSerializedSize() { } if (((bitField0_ & 0x00000010) == 0x00000010)) { size += com.google.protobuf.CodedOutputStream + .computeUInt32Size(6, writerVersion_); + } + if (((bitField0_ & 0x00000020) == 0x00000020)) { + size += com.google.protobuf.CodedOutputStream .computeBytesSize(8000, getMagicBytes()); } size += getUnknownFields().getSerializedSize(); @@ -16889,8 +16988,10 @@ public Builder clear() { bitField0_ = (bitField0_ & ~0x00000008); metadataLength_ = 0L; bitField0_ = (bitField0_ & ~0x00000010); - magic_ = ""; + writerVersion_ = 0; bitField0_ = (bitField0_ & ~0x00000020); + magic_ = ""; + bitField0_ = (bitField0_ & ~0x00000040); return this; } @@ -16943,6 +17044,10 @@ public Builder clone() { if (((from_bitField0_ & 0x00000020) == 0x00000020)) { to_bitField0_ |= 0x00000010; } + result.writerVersion_ = writerVersion_; + if (((from_bitField0_ & 0x00000040) == 0x00000040)) { + to_bitField0_ |= 0x00000020; + } result.magic_ = magic_; result.bitField0_ = to_bitField0_; onBuilt(); @@ -16982,8 +17087,11 @@ public Builder mergeFrom(org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript ot if (other.hasMetadataLength()) { setMetadataLength(other.getMetadataLength()); } + if (other.hasWriterVersion()) { + setWriterVersion(other.getWriterVersion()); + } if (other.hasMagic()) { - bitField0_ |= 0x00000020; + bitField0_ |= 0x00000040; magic_ = other.magic_; onChanged(); } @@ -17126,6 +17234,12 @@ private void ensureVersionIsMutable() { } /** * repeated uint32 version = 4 [packed = true]; + * + *
+       * the version of the file format
+       *   [0, 11] = Hive 0.11
+       *   [0, 12] = Hive 0.12
+       * 
*/ public java.util.List getVersionList() { @@ -17133,18 +17247,36 @@ private void ensureVersionIsMutable() { } /** * repeated uint32 version = 4 [packed = true]; + * + *
+       * the version of the file format
+       *   [0, 11] = Hive 0.11
+       *   [0, 12] = Hive 0.12
+       * 
*/ public int getVersionCount() { return version_.size(); } /** * repeated uint32 version = 4 [packed = true]; + * + *
+       * the version of the file format
+       *   [0, 11] = Hive 0.11
+       *   [0, 12] = Hive 0.12
+       * 
*/ public int getVersion(int index) { return version_.get(index); } /** * repeated uint32 version = 4 [packed = true]; + * + *
+       * the version of the file format
+       *   [0, 11] = Hive 0.11
+       *   [0, 12] = Hive 0.12
+       * 
*/ public Builder setVersion( int index, int value) { @@ -17155,6 +17287,12 @@ public Builder setVersion( } /** * repeated uint32 version = 4 [packed = true]; + * + *
+       * the version of the file format
+       *   [0, 11] = Hive 0.11
+       *   [0, 12] = Hive 0.12
+       * 
*/ public Builder addVersion(int value) { ensureVersionIsMutable(); @@ -17164,6 +17302,12 @@ public Builder addVersion(int value) { } /** * repeated uint32 version = 4 [packed = true]; + * + *
+       * the version of the file format
+       *   [0, 11] = Hive 0.11
+       *   [0, 12] = Hive 0.12
+       * 
*/ public Builder addAllVersion( java.lang.Iterable values) { @@ -17174,6 +17318,12 @@ public Builder addAllVersion( } /** * repeated uint32 version = 4 [packed = true]; + * + *
+       * the version of the file format
+       *   [0, 11] = Hive 0.11
+       *   [0, 12] = Hive 0.12
+       * 
*/ public Builder clearVersion() { version_ = java.util.Collections.emptyList(); @@ -17215,6 +17365,63 @@ public Builder clearMetadataLength() { return this; } + // optional uint32 writerVersion = 6; + private int writerVersion_ ; + /** + * optional uint32 writerVersion = 6; + * + *
+       * Version of the writer:
+       *   0 (or missing) = original
+       *   1 = HIVE-8732 fixed
+       * 
+ */ + public boolean hasWriterVersion() { + return ((bitField0_ & 0x00000020) == 0x00000020); + } + /** + * optional uint32 writerVersion = 6; + * + *
+       * Version of the writer:
+       *   0 (or missing) = original
+       *   1 = HIVE-8732 fixed
+       * 
+ */ + public int getWriterVersion() { + return writerVersion_; + } + /** + * optional uint32 writerVersion = 6; + * + *
+       * Version of the writer:
+       *   0 (or missing) = original
+       *   1 = HIVE-8732 fixed
+       * 
+ */ + public Builder setWriterVersion(int value) { + bitField0_ |= 0x00000020; + writerVersion_ = value; + onChanged(); + return this; + } + /** + * optional uint32 writerVersion = 6; + * + *
+       * Version of the writer:
+       *   0 (or missing) = original
+       *   1 = HIVE-8732 fixed
+       * 
+ */ + public Builder clearWriterVersion() { + bitField0_ = (bitField0_ & ~0x00000020); + writerVersion_ = 0; + onChanged(); + return this; + } + // optional string magic = 8000; private java.lang.Object magic_ = ""; /** @@ -17225,7 +17432,7 @@ public Builder clearMetadataLength() { * */ public boolean hasMagic() { - return ((bitField0_ & 0x00000020) == 0x00000020); + return ((bitField0_ & 0x00000040) == 0x00000040); } /** * optional string magic = 8000; @@ -17277,7 +17484,7 @@ public Builder setMagic( if (value == null) { throw new NullPointerException(); } - bitField0_ |= 0x00000020; + bitField0_ |= 0x00000040; magic_ = value; onChanged(); return this; @@ -17290,7 +17497,7 @@ public Builder setMagic( * */ public Builder clearMagic() { - bitField0_ = (bitField0_ & ~0x00000020); + bitField0_ = (bitField0_ & ~0x00000040); magic_ = getDefaultInstance().getMagic(); onChanged(); return this; @@ -17307,7 +17514,7 @@ public Builder setMagicBytes( if (value == null) { throw new NullPointerException(); } - bitField0_ |= 0x00000020; + bitField0_ |= 0x00000040; magic_ = value; onChanged(); return this; @@ -17513,13 +17720,14 @@ public Builder setMagicBytes( "em\022\024\n\014numberOfRows\030\006 \001(\004\022F\n\nstatistics\030\007" + " \003(\01322.org.apache.hadoop.hive.ql.io.orc." + "ColumnStatistics\022\026\n\016rowIndexStride\030\010 \001(\r" + - "\"\305\001\n\nPostScript\022\024\n\014footerLength\030\001 \001(\004\022F\n" + + "\"\334\001\n\nPostScript\022\024\n\014footerLength\030\001 \001(\004\022F\n" + "\013compression\030\002 \001(\01621.org.apache.hadoop.h" + "ive.ql.io.orc.CompressionKind\022\034\n\024compres" + "sionBlockSize\030\003 \001(\004\022\023\n\007version\030\004 \003(\rB\002\020\001" + - "\022\026\n\016metadataLength\030\005 \001(\004\022\016\n\005magic\030\300> \001(\t", - "*:\n\017CompressionKind\022\010\n\004NONE\020\000\022\010\n\004ZLIB\020\001\022" + - "\n\n\006SNAPPY\020\002\022\007\n\003LZO\020\003" + "\022\026\n\016metadataLength\030\005 \001(\004\022\025\n\rwriterVersio", + "n\030\006 \001(\r\022\016\n\005magic\030\300> \001(\t*:\n\017CompressionKi" + + "nd\022\010\n\004NONE\020\000\022\010\n\004ZLIB\020\001\022\n\n\006SNAPPY\020\002\022\007\n\003LZ" + + "O\020\003" }; com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { @@ -17651,7 +17859,7 @@ public Builder setMagicBytes( internal_static_org_apache_hadoop_hive_ql_io_orc_PostScript_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_org_apache_hadoop_hive_ql_io_orc_PostScript_descriptor, - new java.lang.String[] { "FooterLength", "Compression", "CompressionBlockSize", "Version", "MetadataLength", "Magic", }); + new java.lang.String[] { "FooterLength", "Compression", "CompressionBlockSize", "Version", "MetadataLength", "WriterVersion", "Magic", }); return null; } }; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java index 65b5ca8..3235b0e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java @@ -394,7 +394,8 @@ void merge(ColumnStatisticsImpl other) { } else if (str.minimum != null) { if (minimum.compareTo(str.minimum) > 0) { minimum = new Text(str.getMinimum()); - } else if (maximum.compareTo(str.maximum) < 0) { + } + if (maximum.compareTo(str.maximum) < 0) { 
maximum = new Text(str.getMaximum()); } } @@ -563,7 +564,8 @@ void merge(ColumnStatisticsImpl other) { } else if (dec.minimum != null) { if (minimum.compareTo(dec.minimum) > 0) { minimum = dec.minimum; - } else if (maximum.compareTo(dec.maximum) < 0) { + } + if (maximum.compareTo(dec.maximum) < 0) { maximum = dec.maximum; } if (sum == null || dec.sum == null) { @@ -671,7 +673,8 @@ void merge(ColumnStatisticsImpl other) { } else if (dateStats.minimum != null) { if (minimum > dateStats.minimum) { minimum = dateStats.minimum; - } else if (maximum < dateStats.maximum) { + } + if (maximum < dateStats.maximum) { maximum = dateStats.maximum; } } @@ -767,7 +770,8 @@ void merge(ColumnStatisticsImpl other) { } else if (timestampStats.minimum != null) { if (minimum > timestampStats.minimum) { minimum = timestampStats.minimum; - } else if (maximum < timestampStats.maximum) { + } + if (maximum < timestampStats.maximum) { maximum = timestampStats.maximum; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java index 39326c9..b46937c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java @@ -97,6 +97,26 @@ public int getMinor() { } } + /** + * Records the version of the writer in terms of which bugs have been fixed. + * For bugs in the writer, but the old readers already read the new data + * correctly, bump this version instead of the Version. + */ + public static enum WriterVersion { + ORIGINAL(0), + HIVE_8732(1); // corrupted stripe/file maximum column statistics + + private final int id; + + public int getId() { + return id; + } + + private WriterVersion(int id) { + this.id = id; + } + } + public static enum EncodingStrategy { SPEED, COMPRESSION; } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index 9340ce6..e2c5c24 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -775,7 +775,9 @@ public void run() { Reader.Options options = new Reader.Options(); setIncludedColumns(options, types, context.conf, isOriginal); setSearchArgument(options, types, context.conf, isOriginal); - if (options.getSearchArgument() != null) { + // only do split pruning if HIVE-8732 has been fixed in the writer + if (options.getSearchArgument() != null && + fileMetaInfo.writerVersion != OrcFile.WriterVersion.ORIGINAL) { SearchArgument sarg = options.getSearchArgument(); List sargLeaves = sarg.getLeaves(); List stripeStats = metadata.getStripeStatistics(); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java index edf5e8e..da23544 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java @@ -38,6 +38,7 @@ private boolean isOriginal; private boolean hasBase; private final List deltas = new ArrayList(); + private OrcFile.WriterVersion writerVersion; protected OrcNewSplit(){ //The FileSplit() constructor in hadoop 0.20 and 1.x is package private so can't use it. 
@@ -83,6 +84,7 @@ public void write(DataOutput out) throws IOException { WritableUtils.writeVInt(out, footerBuff.limit() - footerBuff.position()); out.write(footerBuff.array(), footerBuff.position(), footerBuff.limit() - footerBuff.position()); + WritableUtils.writeVInt(out, fileMetaInfo.writerVersion.getId()); } } @@ -111,9 +113,11 @@ public void readFields(DataInput in) throws IOException { int footerBuffSize = WritableUtils.readVInt(in); ByteBuffer footerBuff = ByteBuffer.allocate(footerBuffSize); in.readFully(footerBuff.array(), 0, footerBuffSize); + OrcFile.WriterVersion writerVersion = + ReaderImpl.getWriterVersion(WritableUtils.readVInt(in)); fileMetaInfo = new ReaderImpl.FileMetaInfo(compressionType, bufferSize, - metadataSize, footerBuff); + metadataSize, footerBuff, writerVersion); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java index 48160c1..84192d5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java @@ -42,6 +42,7 @@ private boolean isOriginal; private boolean hasBase; private final List deltas = new ArrayList(); + private OrcFile.WriterVersion writerVersion; static final int BASE_FLAG = 4; static final int ORIGINAL_FLAG = 2; @@ -92,6 +93,7 @@ public void write(DataOutput out) throws IOException { WritableUtils.writeVInt(out, footerBuff.limit() - footerBuff.position()); out.write(footerBuff.array(), footerBuff.position(), footerBuff.limit() - footerBuff.position()); + WritableUtils.writeVInt(out, fileMetaInfo.writerVersion.getId()); } } @@ -120,9 +122,11 @@ public void readFields(DataInput in) throws IOException { int footerBuffSize = WritableUtils.readVInt(in); ByteBuffer footerBuff = ByteBuffer.allocate(footerBuffSize); in.readFully(footerBuff.array(), 0, footerBuffSize); + OrcFile.WriterVersion writerVersion = + ReaderImpl.getWriterVersion(WritableUtils.readVInt(in)); fileMetaInfo = new ReaderImpl.FileMetaInfo(compressionType, bufferSize, - metadataSize, footerBuff); + metadataSize, footerBuff, writerVersion); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java index df5afd1..e42b394 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java @@ -129,6 +129,11 @@ List getTypes(); /** + * Get the version of the writer of this file. + */ + OrcFile.WriterVersion getWriterVersion(); + + /** * Options for creating a RecordReader. 
*/ public static class Options { diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java index 13f79df..2d3f8dc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java @@ -62,6 +62,7 @@ private long deserializedSize = -1; private final Configuration conf; private final List versionList; + private final OrcFile.WriterVersion writerVersion; //serialized footer - Keeping this around for use by getFileMetaInfo() // will help avoid cpu cycles spend in deserializing at cost of increased @@ -182,6 +183,11 @@ public long getContentLength() { } @Override + public OrcFile.WriterVersion getWriterVersion() { + return null; + } + + @Override public int getRowIndexStride() { return footer.getRowIndexStride(); } @@ -309,8 +315,22 @@ static void checkOrcVersion(Log log, Path path, List version) { this.footer = rInfo.footer; this.inspector = rInfo.inspector; this.versionList = footerMetaData.versionList; + this.writerVersion = footerMetaData.writerVersion; } + /** + * Get the WriterVersion based on the ORC file postscript. + * @param writerVersion the integer writer version + * @return + */ + static OrcFile.WriterVersion getWriterVersion(int writerVersion) { + for(OrcFile.WriterVersion version: OrcFile.WriterVersion.values()) { + if (version.getId() == writerVersion) { + return version; + } + } + return OrcFile.WriterVersion.ORIGINAL; + } private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, Path path, @@ -346,6 +366,12 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, int footerSize = (int) ps.getFooterLength(); int metadataSize = (int) ps.getMetadataLength(); + OrcFile.WriterVersion writerVersion; + if (ps.hasWriterVersion()) { + writerVersion = getWriterVersion(ps.getWriterVersion()); + } else { + writerVersion = OrcFile.WriterVersion.ORIGINAL; + } //check compression codec switch (ps.getCompression()) { @@ -391,7 +417,8 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, (int) ps.getCompressionBlockSize(), (int) ps.getMetadataLength(), buffer, - ps.getVersionList() + ps.getVersionList(), + writerVersion ); } @@ -451,25 +478,29 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, final int metadataSize; final ByteBuffer footerBuffer; final List versionList; + final OrcFile.WriterVersion writerVersion; FileMetaInfo(String compressionType, int bufferSize, int metadataSize, - ByteBuffer footerBuffer) { - this(compressionType, bufferSize, metadataSize, footerBuffer, null); + ByteBuffer footerBuffer, OrcFile.WriterVersion writerVersion) { + this(compressionType, bufferSize, metadataSize, footerBuffer, null, + writerVersion); } FileMetaInfo(String compressionType, int bufferSize, int metadataSize, - ByteBuffer footerBuffer, List versionList){ + ByteBuffer footerBuffer, List versionList, + OrcFile.WriterVersion writerVersion){ this.compressionType = compressionType; this.bufferSize = bufferSize; this.metadataSize = metadataSize; this.footerBuffer = footerBuffer; this.versionList = versionList; + this.writerVersion = writerVersion; } } public FileMetaInfo getFileMetaInfo(){ return new FileMetaInfo(compressionKind.toString(), bufferSize, - metadataSize, footerByteBuffer, versionList); + metadataSize, footerByteBuffer, versionList, writerVersion); } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java 
index 620d4d4..9e69de6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java @@ -2230,7 +2230,8 @@ private int writePostScript(int footerLength, int metadataLength) throws IOExcep .setMetadataLength(metadataLength) .setMagic(OrcFile.MAGIC) .addVersion(version.getMajor()) - .addVersion(version.getMinor()); + .addVersion(version.getMinor()) + .setWriterVersion(OrcFile.WriterVersion.HIVE_8732.getId()); if (compress != CompressionKind.NONE) { builder.setCompressionBlockSize(bufferSize); } diff --git ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto index 31c49f1..cbfe57b 100644 --- ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto +++ ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto @@ -191,8 +191,15 @@ message PostScript { optional uint64 footerLength = 1; optional CompressionKind compression = 2; optional uint64 compressionBlockSize = 3; + // the version of the file format + // [0, 11] = Hive 0.11 + // [0, 12] = Hive 0.12 repeated uint32 version = 4 [packed = true]; optional uint64 metadataLength = 5; + // Version of the writer: + // 0 (or missing) = original + // 1 = HIVE-8732 fixed + optional uint32 writerVersion = 6; // Leave this last in the record optional string magic = 8000; } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestColumnStatistics.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestColumnStatistics.java new file mode 100644 index 0000000..dbd38c8 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestColumnStatistics.java @@ -0,0 +1,176 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.orc; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.junit.Test; + +import java.sql.Timestamp; + +import static junit.framework.Assert.assertEquals; + +/** + * Test ColumnStatisticsImpl for ORC. 
+ */ +public class TestColumnStatistics { + + @Test + public void testLongMerge() throws Exception { + ObjectInspector inspector = + PrimitiveObjectInspectorFactory.javaIntObjectInspector; + + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(inspector); + ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(inspector); + stats1.updateInteger(10); + stats1.updateInteger(10); + stats2.updateInteger(1); + stats2.updateInteger(1000); + stats1.merge(stats2); + IntegerColumnStatistics typed = (IntegerColumnStatistics) stats1; + assertEquals(1, typed.getMinimum()); + assertEquals(1000, typed.getMaximum()); + stats1.reset(); + stats1.updateInteger(-10); + stats1.updateInteger(10000); + stats1.merge(stats2); + assertEquals(-10, typed.getMinimum()); + assertEquals(10000, typed.getMaximum()); + } + + @Test + public void testDoubleMerge() throws Exception { + ObjectInspector inspector = + PrimitiveObjectInspectorFactory.javaDoubleObjectInspector; + + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(inspector); + ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(inspector); + stats1.updateDouble(10.0); + stats1.updateDouble(100.0); + stats2.updateDouble(1.0); + stats2.updateDouble(1000.0); + stats1.merge(stats2); + DoubleColumnStatistics typed = (DoubleColumnStatistics) stats1; + assertEquals(1.0, typed.getMinimum(), 0.001); + assertEquals(1000.0, typed.getMaximum(), 0.001); + stats1.reset(); + stats1.updateDouble(-10); + stats1.updateDouble(10000); + stats1.merge(stats2); + assertEquals(-10, typed.getMinimum(), 0.001); + assertEquals(10000, typed.getMaximum(), 0.001); + } + + + @Test + public void testStringMerge() throws Exception { + ObjectInspector inspector = + PrimitiveObjectInspectorFactory.javaStringObjectInspector; + + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(inspector); + ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(inspector); + stats1.updateString(new Text("bob")); + stats1.updateString(new Text("david")); + stats1.updateString(new Text("charles")); + stats2.updateString(new Text("anne")); + stats2.updateString(new Text("erin")); + stats1.merge(stats2); + StringColumnStatistics typed = (StringColumnStatistics) stats1; + assertEquals("anne", typed.getMinimum()); + assertEquals("erin", typed.getMaximum()); + stats1.reset(); + stats1.updateString(new Text("aaa")); + stats1.updateString(new Text("zzz")); + stats1.merge(stats2); + assertEquals("aaa", typed.getMinimum()); + assertEquals("zzz", typed.getMaximum()); + } + + @Test + public void testDateMerge() throws Exception { + ObjectInspector inspector = + PrimitiveObjectInspectorFactory.javaDateObjectInspector; + + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(inspector); + ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(inspector); + stats1.updateDate(new DateWritable(1000)); + stats1.updateDate(new DateWritable(100)); + stats2.updateDate(new DateWritable(10)); + stats2.updateDate(new DateWritable(2000)); + stats1.merge(stats2); + DateColumnStatistics typed = (DateColumnStatistics) stats1; + assertEquals(new DateWritable(10), typed.getMinimum()); + assertEquals(new DateWritable(2000), typed.getMaximum()); + stats1.reset(); + stats1.updateDate(new DateWritable(-10)); + stats1.updateDate(new DateWritable(10000)); + stats1.merge(stats2); + assertEquals(-10, typed.getMinimum().getDays()); + assertEquals(10000, typed.getMaximum().getDays()); + } + + @Test + public void testTimestampMerge() throws Exception { + ObjectInspector inspector = + 
PrimitiveObjectInspectorFactory.javaTimestampObjectInspector; + + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(inspector); + ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(inspector); + stats1.updateTimestamp(new Timestamp(10)); + stats1.updateTimestamp(new Timestamp(100)); + stats2.updateTimestamp(new Timestamp(1)); + stats2.updateTimestamp(new Timestamp(1000)); + stats1.merge(stats2); + TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1; + assertEquals(1, typed.getMinimum().getTime()); + assertEquals(1000, typed.getMaximum().getTime()); + stats1.reset(); + stats1.updateTimestamp(new Timestamp(-10)); + stats1.updateTimestamp(new Timestamp(10000)); + stats1.merge(stats2); + assertEquals(-10, typed.getMinimum().getTime()); + assertEquals(10000, typed.getMaximum().getTime()); + } + + @Test + public void testDecimalMerge() throws Exception { + ObjectInspector inspector = + PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector; + + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(inspector); + ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(inspector); + stats1.updateDecimal(HiveDecimal.create(10)); + stats1.updateDecimal(HiveDecimal.create(100)); + stats2.updateDecimal(HiveDecimal.create(1)); + stats2.updateDecimal(HiveDecimal.create(1000)); + stats1.merge(stats2); + DecimalColumnStatistics typed = (DecimalColumnStatistics) stats1; + assertEquals(1, typed.getMinimum().longValue()); + assertEquals(1000, typed.getMaximum().longValue()); + stats1.reset(); + stats1.updateDecimal(HiveDecimal.create(-10)); + stats1.updateDecimal(HiveDecimal.create(10000)); + stats1.merge(stats2); + assertEquals(-10, typed.getMinimum().longValue()); + assertEquals(10000, typed.getMaximum().longValue()); + } +}
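
The statistics bug this patch fixes is visible in the ColumnStatisticsImpl.merge() hunks above: the maximum comparison was chained behind the minimum comparison with else-if, so whenever the incoming statistics lowered the minimum, a larger maximum was silently dropped and the stripe/file maximum stayed too small. A minimal standalone sketch of the faulty and corrected merge shapes (MinMaxStats is an illustrative stand-in, not the actual ColumnStatisticsImpl API):

// Minimal sketch of the merge bug fixed by HIVE-8732; MinMaxStats is an
// illustrative stand-in, not the actual ColumnStatisticsImpl class.
final class MinMaxStats {
  long minimum;
  long maximum;

  MinMaxStats(long min, long max) {
    this.minimum = min;
    this.maximum = max;
  }

  // Pre-patch shape: the maximum check is skipped whenever the minimum check
  // fires, so merging {1, 1000} into {10, 10} keeps maximum = 10.
  void mergeBuggy(MinMaxStats other) {
    if (minimum > other.minimum) {
      minimum = other.minimum;
    } else if (maximum < other.maximum) {
      maximum = other.maximum;
    }
  }

  // Patched shape: minimum and maximum are updated independently.
  void mergeFixed(MinMaxStats other) {
    if (minimum > other.minimum) {
      minimum = other.minimum;
    }
    if (maximum < other.maximum) {
      maximum = other.maximum;
    }
  }

  public static void main(String[] args) {
    MinMaxStats buggy = new MinMaxStats(10, 10);
    buggy.mergeBuggy(new MinMaxStats(1, 1000));
    System.out.println(buggy.minimum + ".." + buggy.maximum);   // 1..10, maximum lost

    MinMaxStats fixed = new MinMaxStats(10, 10);
    fixed.mergeFixed(new MinMaxStats(1, 1000));
    System.out.println(fixed.minimum + ".." + fixed.maximum);   // 1..1000
  }
}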
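
To let readers tell files with trustworthy statistics from older ones, the patch adds an optional writerVersion field to the PostScript and maps it back to OrcFile.WriterVersion on read, defaulting to ORIGINAL when the field is missing or the id is unrecognized; OrcInputFormat then only evaluates a search argument against stripe statistics when the writer already contains the HIVE-8732 fix. A self-contained sketch of that lookup and guard, mirroring ReaderImpl.getWriterVersion(int) and the OrcInputFormat check from the patch (the enum is reproduced here so the snippet compiles on its own; it is not the Hive class itself, and the helper names are illustrative):

// Standalone sketch mirroring OrcFile.WriterVersion, ReaderImpl.getWriterVersion(int)
// and the split-pruning guard added to OrcInputFormat; not the Hive classes themselves.
enum WriterVersion {
  ORIGINAL(0),
  HIVE_8732(1);   // corrupted stripe/file maximum column statistics fixed

  private final int id;

  WriterVersion(int id) {
    this.id = id;
  }

  int getId() {
    return id;
  }

  // Missing or unrecognized ids fall back to ORIGINAL, so old files (and files
  // from writers this reader does not know about) are treated conservatively.
  static WriterVersion from(int id) {
    for (WriterVersion version : values()) {
      if (version.getId() == id) {
        return version;
      }
    }
    return ORIGINAL;
  }

  // Only do split pruning if HIVE-8732 has been fixed in the writer: statistics
  // written before the fix may carry a wrong maximum, and evaluating a predicate
  // against them could discard stripes that actually contain matching rows.
  static boolean canPruneWithStripeStats(boolean hasSearchArgument,
                                         WriterVersion writerVersion) {
    return hasSearchArgument && writerVersion != ORIGINAL;
  }
}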
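
The writer version also has to survive split serialization, since OrcSplit and OrcNewSplit ship the cached footer to the tasks: both splits now append the version id as a vint after the footer buffer in write() and read it back in readFields() before rebuilding FileMetaInfo. A small round-trip sketch of that encoding using Hadoop's WritableUtils (the class name, stream setup, and values are illustrative, not the actual split code):

// Round-trip sketch of the vint the splits now append for the writer version;
// mirrors the OrcSplit/OrcNewSplit changes, with illustrative stream setup.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.WritableUtils;

public class WriterVersionRoundTrip {
  static void write(DataOutput out, int writerVersionId) throws IOException {
    // appended after the serialized footer buffer, as in OrcSplit.write()
    WritableUtils.writeVInt(out, writerVersionId);
  }

  static int read(DataInput in) throws IOException {
    // read back at the same offset, as in OrcSplit.readFields()
    return WritableUtils.readVInt(in);
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    write(new DataOutputStream(bytes), 1);   // 1 = HIVE_8732
    DataInput in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
    System.out.println(read(in));            // prints 1
  }
}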