diff --git a/orc/src/gen/protobuf-java/org/apache/orc/OrcProto.java b/orc/src/gen/protobuf-java/org/apache/orc/OrcProto.java index 24715c3..52ea5d7 100644 --- a/orc/src/gen/protobuf-java/org/apache/orc/OrcProto.java +++ b/orc/src/gen/protobuf-java/org/apache/orc/OrcProto.java @@ -18940,6 +18940,875 @@ public Builder setMagicBytes( // @@protoc_insertion_point(class_scope:orc.proto.PostScript) } + public interface FileTailOrBuilder + extends com.google.protobuf.MessageOrBuilder { + + // optional .orc.proto.PostScript postscript = 1; + /** + * optional .orc.proto.PostScript postscript = 1; + */ + boolean hasPostscript(); + /** + * optional .orc.proto.PostScript postscript = 1; + */ + org.apache.orc.OrcProto.PostScript getPostscript(); + /** + * optional .orc.proto.PostScript postscript = 1; + */ + org.apache.orc.OrcProto.PostScriptOrBuilder getPostscriptOrBuilder(); + + // optional .orc.proto.Footer footer = 2; + /** + * optional .orc.proto.Footer footer = 2; + */ + boolean hasFooter(); + /** + * optional .orc.proto.Footer footer = 2; + */ + org.apache.orc.OrcProto.Footer getFooter(); + /** + * optional .orc.proto.Footer footer = 2; + */ + org.apache.orc.OrcProto.FooterOrBuilder getFooterOrBuilder(); + + // optional uint64 fileLength = 3; + /** + * optional uint64 fileLength = 3; + */ + boolean hasFileLength(); + /** + * optional uint64 fileLength = 3; + */ + long getFileLength(); + + // optional uint64 postscriptLength = 4; + /** + * optional uint64 postscriptLength = 4; + */ + boolean hasPostscriptLength(); + /** + * optional uint64 postscriptLength = 4; + */ + long getPostscriptLength(); + } + /** + * Protobuf type {@code orc.proto.FileTail} + * + *
+   * The contents of the file tail that must be serialized.
+   * 
+ */ + public static final class FileTail extends + com.google.protobuf.GeneratedMessage + implements FileTailOrBuilder { + // Use FileTail.newBuilder() to construct. + private FileTail(com.google.protobuf.GeneratedMessage.Builder builder) { + super(builder); + this.unknownFields = builder.getUnknownFields(); + } + private FileTail(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); } + + private static final FileTail defaultInstance; + public static FileTail getDefaultInstance() { + return defaultInstance; + } + + public FileTail getDefaultInstanceForType() { + return defaultInstance; + } + + private final com.google.protobuf.UnknownFieldSet unknownFields; + @java.lang.Override + public final com.google.protobuf.UnknownFieldSet + getUnknownFields() { + return this.unknownFields; + } + private FileTail( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + initFields(); + int mutable_bitField0_ = 0; + com.google.protobuf.UnknownFieldSet.Builder unknownFields = + com.google.protobuf.UnknownFieldSet.newBuilder(); + try { + boolean done = false; + while (!done) { + int tag = input.readTag(); + switch (tag) { + case 0: + done = true; + break; + default: { + if (!parseUnknownField(input, unknownFields, + extensionRegistry, tag)) { + done = true; + } + break; + } + case 10: { + org.apache.orc.OrcProto.PostScript.Builder subBuilder = null; + if (((bitField0_ & 0x00000001) == 0x00000001)) { + subBuilder = postscript_.toBuilder(); + } + postscript_ = input.readMessage(org.apache.orc.OrcProto.PostScript.PARSER, extensionRegistry); + if (subBuilder != null) { + subBuilder.mergeFrom(postscript_); + postscript_ = subBuilder.buildPartial(); + } + bitField0_ |= 0x00000001; + break; + } + case 18: { + org.apache.orc.OrcProto.Footer.Builder subBuilder = null; + if (((bitField0_ & 0x00000002) == 0x00000002)) { + subBuilder = footer_.toBuilder(); + } + footer_ = input.readMessage(org.apache.orc.OrcProto.Footer.PARSER, extensionRegistry); + if (subBuilder != null) { + subBuilder.mergeFrom(footer_); + footer_ = subBuilder.buildPartial(); + } + bitField0_ |= 0x00000002; + break; + } + case 24: { + bitField0_ |= 0x00000004; + fileLength_ = input.readUInt64(); + break; + } + case 32: { + bitField0_ |= 0x00000008; + postscriptLength_ = input.readUInt64(); + break; + } + } + } + } catch (com.google.protobuf.InvalidProtocolBufferException e) { + throw e.setUnfinishedMessage(this); + } catch (java.io.IOException e) { + throw new com.google.protobuf.InvalidProtocolBufferException( + e.getMessage()).setUnfinishedMessage(this); + } finally { + this.unknownFields = unknownFields.build(); + makeExtensionsImmutable(); + } + } + public static final com.google.protobuf.Descriptors.Descriptor + getDescriptor() { + return org.apache.orc.OrcProto.internal_static_orc_proto_FileTail_descriptor; + } + + protected com.google.protobuf.GeneratedMessage.FieldAccessorTable + internalGetFieldAccessorTable() { + return org.apache.orc.OrcProto.internal_static_orc_proto_FileTail_fieldAccessorTable + .ensureFieldAccessorsInitialized( + org.apache.orc.OrcProto.FileTail.class, org.apache.orc.OrcProto.FileTail.Builder.class); + } + + public static com.google.protobuf.Parser PARSER = + new com.google.protobuf.AbstractParser() { + public FileTail parsePartialFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws 
com.google.protobuf.InvalidProtocolBufferException { + return new FileTail(input, extensionRegistry); + } + }; + + @java.lang.Override + public com.google.protobuf.Parser getParserForType() { + return PARSER; + } + + private int bitField0_; + // optional .orc.proto.PostScript postscript = 1; + public static final int POSTSCRIPT_FIELD_NUMBER = 1; + private org.apache.orc.OrcProto.PostScript postscript_; + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public boolean hasPostscript() { + return ((bitField0_ & 0x00000001) == 0x00000001); + } + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public org.apache.orc.OrcProto.PostScript getPostscript() { + return postscript_; + } + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public org.apache.orc.OrcProto.PostScriptOrBuilder getPostscriptOrBuilder() { + return postscript_; + } + + // optional .orc.proto.Footer footer = 2; + public static final int FOOTER_FIELD_NUMBER = 2; + private org.apache.orc.OrcProto.Footer footer_; + /** + * optional .orc.proto.Footer footer = 2; + */ + public boolean hasFooter() { + return ((bitField0_ & 0x00000002) == 0x00000002); + } + /** + * optional .orc.proto.Footer footer = 2; + */ + public org.apache.orc.OrcProto.Footer getFooter() { + return footer_; + } + /** + * optional .orc.proto.Footer footer = 2; + */ + public org.apache.orc.OrcProto.FooterOrBuilder getFooterOrBuilder() { + return footer_; + } + + // optional uint64 fileLength = 3; + public static final int FILELENGTH_FIELD_NUMBER = 3; + private long fileLength_; + /** + * optional uint64 fileLength = 3; + */ + public boolean hasFileLength() { + return ((bitField0_ & 0x00000004) == 0x00000004); + } + /** + * optional uint64 fileLength = 3; + */ + public long getFileLength() { + return fileLength_; + } + + // optional uint64 postscriptLength = 4; + public static final int POSTSCRIPTLENGTH_FIELD_NUMBER = 4; + private long postscriptLength_; + /** + * optional uint64 postscriptLength = 4; + */ + public boolean hasPostscriptLength() { + return ((bitField0_ & 0x00000008) == 0x00000008); + } + /** + * optional uint64 postscriptLength = 4; + */ + public long getPostscriptLength() { + return postscriptLength_; + } + + private void initFields() { + postscript_ = org.apache.orc.OrcProto.PostScript.getDefaultInstance(); + footer_ = org.apache.orc.OrcProto.Footer.getDefaultInstance(); + fileLength_ = 0L; + postscriptLength_ = 0L; + } + private byte memoizedIsInitialized = -1; + public final boolean isInitialized() { + byte isInitialized = memoizedIsInitialized; + if (isInitialized != -1) return isInitialized == 1; + + memoizedIsInitialized = 1; + return true; + } + + public void writeTo(com.google.protobuf.CodedOutputStream output) + throws java.io.IOException { + getSerializedSize(); + if (((bitField0_ & 0x00000001) == 0x00000001)) { + output.writeMessage(1, postscript_); + } + if (((bitField0_ & 0x00000002) == 0x00000002)) { + output.writeMessage(2, footer_); + } + if (((bitField0_ & 0x00000004) == 0x00000004)) { + output.writeUInt64(3, fileLength_); + } + if (((bitField0_ & 0x00000008) == 0x00000008)) { + output.writeUInt64(4, postscriptLength_); + } + getUnknownFields().writeTo(output); + } + + private int memoizedSerializedSize = -1; + public int getSerializedSize() { + int size = memoizedSerializedSize; + if (size != -1) return size; + + size = 0; + if (((bitField0_ & 0x00000001) == 0x00000001)) { + size += com.google.protobuf.CodedOutputStream + .computeMessageSize(1, postscript_); + } + if (((bitField0_ & 0x00000002) 
== 0x00000002)) { + size += com.google.protobuf.CodedOutputStream + .computeMessageSize(2, footer_); + } + if (((bitField0_ & 0x00000004) == 0x00000004)) { + size += com.google.protobuf.CodedOutputStream + .computeUInt64Size(3, fileLength_); + } + if (((bitField0_ & 0x00000008) == 0x00000008)) { + size += com.google.protobuf.CodedOutputStream + .computeUInt64Size(4, postscriptLength_); + } + size += getUnknownFields().getSerializedSize(); + memoizedSerializedSize = size; + return size; + } + + private static final long serialVersionUID = 0L; + @java.lang.Override + protected java.lang.Object writeReplace() + throws java.io.ObjectStreamException { + return super.writeReplace(); + } + + public static org.apache.orc.OrcProto.FileTail parseFrom( + com.google.protobuf.ByteString data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + public static org.apache.orc.OrcProto.FileTail parseFrom( + com.google.protobuf.ByteString data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + public static org.apache.orc.OrcProto.FileTail parseFrom(byte[] data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + public static org.apache.orc.OrcProto.FileTail parseFrom( + byte[] data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + public static org.apache.orc.OrcProto.FileTail parseFrom(java.io.InputStream input) + throws java.io.IOException { + return PARSER.parseFrom(input); + } + public static org.apache.orc.OrcProto.FileTail parseFrom( + java.io.InputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return PARSER.parseFrom(input, extensionRegistry); + } + public static org.apache.orc.OrcProto.FileTail parseDelimitedFrom(java.io.InputStream input) + throws java.io.IOException { + return PARSER.parseDelimitedFrom(input); + } + public static org.apache.orc.OrcProto.FileTail parseDelimitedFrom( + java.io.InputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return PARSER.parseDelimitedFrom(input, extensionRegistry); + } + public static org.apache.orc.OrcProto.FileTail parseFrom( + com.google.protobuf.CodedInputStream input) + throws java.io.IOException { + return PARSER.parseFrom(input); + } + public static org.apache.orc.OrcProto.FileTail parseFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return PARSER.parseFrom(input, extensionRegistry); + } + + public static Builder newBuilder() { return Builder.create(); } + public Builder newBuilderForType() { return newBuilder(); } + public static Builder newBuilder(org.apache.orc.OrcProto.FileTail prototype) { + return newBuilder().mergeFrom(prototype); + } + public Builder toBuilder() { return newBuilder(this); } + + @java.lang.Override + protected Builder newBuilderForType( + com.google.protobuf.GeneratedMessage.BuilderParent parent) { + Builder builder = new Builder(parent); + return builder; + } + /** + * Protobuf type {@code orc.proto.FileTail} + * + *
+     * The contents of the file tail that must be serialized.
+     * 
+ */ + public static final class Builder extends + com.google.protobuf.GeneratedMessage.Builder + implements org.apache.orc.OrcProto.FileTailOrBuilder { + public static final com.google.protobuf.Descriptors.Descriptor + getDescriptor() { + return org.apache.orc.OrcProto.internal_static_orc_proto_FileTail_descriptor; + } + + protected com.google.protobuf.GeneratedMessage.FieldAccessorTable + internalGetFieldAccessorTable() { + return org.apache.orc.OrcProto.internal_static_orc_proto_FileTail_fieldAccessorTable + .ensureFieldAccessorsInitialized( + org.apache.orc.OrcProto.FileTail.class, org.apache.orc.OrcProto.FileTail.Builder.class); + } + + // Construct using org.apache.orc.OrcProto.FileTail.newBuilder() + private Builder() { + maybeForceBuilderInitialization(); + } + + private Builder( + com.google.protobuf.GeneratedMessage.BuilderParent parent) { + super(parent); + maybeForceBuilderInitialization(); + } + private void maybeForceBuilderInitialization() { + if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) { + getPostscriptFieldBuilder(); + getFooterFieldBuilder(); + } + } + private static Builder create() { + return new Builder(); + } + + public Builder clear() { + super.clear(); + if (postscriptBuilder_ == null) { + postscript_ = org.apache.orc.OrcProto.PostScript.getDefaultInstance(); + } else { + postscriptBuilder_.clear(); + } + bitField0_ = (bitField0_ & ~0x00000001); + if (footerBuilder_ == null) { + footer_ = org.apache.orc.OrcProto.Footer.getDefaultInstance(); + } else { + footerBuilder_.clear(); + } + bitField0_ = (bitField0_ & ~0x00000002); + fileLength_ = 0L; + bitField0_ = (bitField0_ & ~0x00000004); + postscriptLength_ = 0L; + bitField0_ = (bitField0_ & ~0x00000008); + return this; + } + + public Builder clone() { + return create().mergeFrom(buildPartial()); + } + + public com.google.protobuf.Descriptors.Descriptor + getDescriptorForType() { + return org.apache.orc.OrcProto.internal_static_orc_proto_FileTail_descriptor; + } + + public org.apache.orc.OrcProto.FileTail getDefaultInstanceForType() { + return org.apache.orc.OrcProto.FileTail.getDefaultInstance(); + } + + public org.apache.orc.OrcProto.FileTail build() { + org.apache.orc.OrcProto.FileTail result = buildPartial(); + if (!result.isInitialized()) { + throw newUninitializedMessageException(result); + } + return result; + } + + public org.apache.orc.OrcProto.FileTail buildPartial() { + org.apache.orc.OrcProto.FileTail result = new org.apache.orc.OrcProto.FileTail(this); + int from_bitField0_ = bitField0_; + int to_bitField0_ = 0; + if (((from_bitField0_ & 0x00000001) == 0x00000001)) { + to_bitField0_ |= 0x00000001; + } + if (postscriptBuilder_ == null) { + result.postscript_ = postscript_; + } else { + result.postscript_ = postscriptBuilder_.build(); + } + if (((from_bitField0_ & 0x00000002) == 0x00000002)) { + to_bitField0_ |= 0x00000002; + } + if (footerBuilder_ == null) { + result.footer_ = footer_; + } else { + result.footer_ = footerBuilder_.build(); + } + if (((from_bitField0_ & 0x00000004) == 0x00000004)) { + to_bitField0_ |= 0x00000004; + } + result.fileLength_ = fileLength_; + if (((from_bitField0_ & 0x00000008) == 0x00000008)) { + to_bitField0_ |= 0x00000008; + } + result.postscriptLength_ = postscriptLength_; + result.bitField0_ = to_bitField0_; + onBuilt(); + return result; + } + + public Builder mergeFrom(com.google.protobuf.Message other) { + if (other instanceof org.apache.orc.OrcProto.FileTail) { + return mergeFrom((org.apache.orc.OrcProto.FileTail)other); + } else { + 
super.mergeFrom(other); + return this; + } + } + + public Builder mergeFrom(org.apache.orc.OrcProto.FileTail other) { + if (other == org.apache.orc.OrcProto.FileTail.getDefaultInstance()) return this; + if (other.hasPostscript()) { + mergePostscript(other.getPostscript()); + } + if (other.hasFooter()) { + mergeFooter(other.getFooter()); + } + if (other.hasFileLength()) { + setFileLength(other.getFileLength()); + } + if (other.hasPostscriptLength()) { + setPostscriptLength(other.getPostscriptLength()); + } + this.mergeUnknownFields(other.getUnknownFields()); + return this; + } + + public final boolean isInitialized() { + return true; + } + + public Builder mergeFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + org.apache.orc.OrcProto.FileTail parsedMessage = null; + try { + parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry); + } catch (com.google.protobuf.InvalidProtocolBufferException e) { + parsedMessage = (org.apache.orc.OrcProto.FileTail) e.getUnfinishedMessage(); + throw e; + } finally { + if (parsedMessage != null) { + mergeFrom(parsedMessage); + } + } + return this; + } + private int bitField0_; + + // optional .orc.proto.PostScript postscript = 1; + private org.apache.orc.OrcProto.PostScript postscript_ = org.apache.orc.OrcProto.PostScript.getDefaultInstance(); + private com.google.protobuf.SingleFieldBuilder< + org.apache.orc.OrcProto.PostScript, org.apache.orc.OrcProto.PostScript.Builder, org.apache.orc.OrcProto.PostScriptOrBuilder> postscriptBuilder_; + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public boolean hasPostscript() { + return ((bitField0_ & 0x00000001) == 0x00000001); + } + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public org.apache.orc.OrcProto.PostScript getPostscript() { + if (postscriptBuilder_ == null) { + return postscript_; + } else { + return postscriptBuilder_.getMessage(); + } + } + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public Builder setPostscript(org.apache.orc.OrcProto.PostScript value) { + if (postscriptBuilder_ == null) { + if (value == null) { + throw new NullPointerException(); + } + postscript_ = value; + onChanged(); + } else { + postscriptBuilder_.setMessage(value); + } + bitField0_ |= 0x00000001; + return this; + } + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public Builder setPostscript( + org.apache.orc.OrcProto.PostScript.Builder builderForValue) { + if (postscriptBuilder_ == null) { + postscript_ = builderForValue.build(); + onChanged(); + } else { + postscriptBuilder_.setMessage(builderForValue.build()); + } + bitField0_ |= 0x00000001; + return this; + } + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public Builder mergePostscript(org.apache.orc.OrcProto.PostScript value) { + if (postscriptBuilder_ == null) { + if (((bitField0_ & 0x00000001) == 0x00000001) && + postscript_ != org.apache.orc.OrcProto.PostScript.getDefaultInstance()) { + postscript_ = + org.apache.orc.OrcProto.PostScript.newBuilder(postscript_).mergeFrom(value).buildPartial(); + } else { + postscript_ = value; + } + onChanged(); + } else { + postscriptBuilder_.mergeFrom(value); + } + bitField0_ |= 0x00000001; + return this; + } + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public Builder clearPostscript() { + if (postscriptBuilder_ == null) { + postscript_ = org.apache.orc.OrcProto.PostScript.getDefaultInstance(); + onChanged(); + } else { + 
postscriptBuilder_.clear(); + } + bitField0_ = (bitField0_ & ~0x00000001); + return this; + } + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public org.apache.orc.OrcProto.PostScript.Builder getPostscriptBuilder() { + bitField0_ |= 0x00000001; + onChanged(); + return getPostscriptFieldBuilder().getBuilder(); + } + /** + * optional .orc.proto.PostScript postscript = 1; + */ + public org.apache.orc.OrcProto.PostScriptOrBuilder getPostscriptOrBuilder() { + if (postscriptBuilder_ != null) { + return postscriptBuilder_.getMessageOrBuilder(); + } else { + return postscript_; + } + } + /** + * optional .orc.proto.PostScript postscript = 1; + */ + private com.google.protobuf.SingleFieldBuilder< + org.apache.orc.OrcProto.PostScript, org.apache.orc.OrcProto.PostScript.Builder, org.apache.orc.OrcProto.PostScriptOrBuilder> + getPostscriptFieldBuilder() { + if (postscriptBuilder_ == null) { + postscriptBuilder_ = new com.google.protobuf.SingleFieldBuilder< + org.apache.orc.OrcProto.PostScript, org.apache.orc.OrcProto.PostScript.Builder, org.apache.orc.OrcProto.PostScriptOrBuilder>( + postscript_, + getParentForChildren(), + isClean()); + postscript_ = null; + } + return postscriptBuilder_; + } + + // optional .orc.proto.Footer footer = 2; + private org.apache.orc.OrcProto.Footer footer_ = org.apache.orc.OrcProto.Footer.getDefaultInstance(); + private com.google.protobuf.SingleFieldBuilder< + org.apache.orc.OrcProto.Footer, org.apache.orc.OrcProto.Footer.Builder, org.apache.orc.OrcProto.FooterOrBuilder> footerBuilder_; + /** + * optional .orc.proto.Footer footer = 2; + */ + public boolean hasFooter() { + return ((bitField0_ & 0x00000002) == 0x00000002); + } + /** + * optional .orc.proto.Footer footer = 2; + */ + public org.apache.orc.OrcProto.Footer getFooter() { + if (footerBuilder_ == null) { + return footer_; + } else { + return footerBuilder_.getMessage(); + } + } + /** + * optional .orc.proto.Footer footer = 2; + */ + public Builder setFooter(org.apache.orc.OrcProto.Footer value) { + if (footerBuilder_ == null) { + if (value == null) { + throw new NullPointerException(); + } + footer_ = value; + onChanged(); + } else { + footerBuilder_.setMessage(value); + } + bitField0_ |= 0x00000002; + return this; + } + /** + * optional .orc.proto.Footer footer = 2; + */ + public Builder setFooter( + org.apache.orc.OrcProto.Footer.Builder builderForValue) { + if (footerBuilder_ == null) { + footer_ = builderForValue.build(); + onChanged(); + } else { + footerBuilder_.setMessage(builderForValue.build()); + } + bitField0_ |= 0x00000002; + return this; + } + /** + * optional .orc.proto.Footer footer = 2; + */ + public Builder mergeFooter(org.apache.orc.OrcProto.Footer value) { + if (footerBuilder_ == null) { + if (((bitField0_ & 0x00000002) == 0x00000002) && + footer_ != org.apache.orc.OrcProto.Footer.getDefaultInstance()) { + footer_ = + org.apache.orc.OrcProto.Footer.newBuilder(footer_).mergeFrom(value).buildPartial(); + } else { + footer_ = value; + } + onChanged(); + } else { + footerBuilder_.mergeFrom(value); + } + bitField0_ |= 0x00000002; + return this; + } + /** + * optional .orc.proto.Footer footer = 2; + */ + public Builder clearFooter() { + if (footerBuilder_ == null) { + footer_ = org.apache.orc.OrcProto.Footer.getDefaultInstance(); + onChanged(); + } else { + footerBuilder_.clear(); + } + bitField0_ = (bitField0_ & ~0x00000002); + return this; + } + /** + * optional .orc.proto.Footer footer = 2; + */ + public org.apache.orc.OrcProto.Footer.Builder getFooterBuilder() { + bitField0_ |= 
0x00000002; + onChanged(); + return getFooterFieldBuilder().getBuilder(); + } + /** + * optional .orc.proto.Footer footer = 2; + */ + public org.apache.orc.OrcProto.FooterOrBuilder getFooterOrBuilder() { + if (footerBuilder_ != null) { + return footerBuilder_.getMessageOrBuilder(); + } else { + return footer_; + } + } + /** + * optional .orc.proto.Footer footer = 2; + */ + private com.google.protobuf.SingleFieldBuilder< + org.apache.orc.OrcProto.Footer, org.apache.orc.OrcProto.Footer.Builder, org.apache.orc.OrcProto.FooterOrBuilder> + getFooterFieldBuilder() { + if (footerBuilder_ == null) { + footerBuilder_ = new com.google.protobuf.SingleFieldBuilder< + org.apache.orc.OrcProto.Footer, org.apache.orc.OrcProto.Footer.Builder, org.apache.orc.OrcProto.FooterOrBuilder>( + footer_, + getParentForChildren(), + isClean()); + footer_ = null; + } + return footerBuilder_; + } + + // optional uint64 fileLength = 3; + private long fileLength_ ; + /** + * optional uint64 fileLength = 3; + */ + public boolean hasFileLength() { + return ((bitField0_ & 0x00000004) == 0x00000004); + } + /** + * optional uint64 fileLength = 3; + */ + public long getFileLength() { + return fileLength_; + } + /** + * optional uint64 fileLength = 3; + */ + public Builder setFileLength(long value) { + bitField0_ |= 0x00000004; + fileLength_ = value; + onChanged(); + return this; + } + /** + * optional uint64 fileLength = 3; + */ + public Builder clearFileLength() { + bitField0_ = (bitField0_ & ~0x00000004); + fileLength_ = 0L; + onChanged(); + return this; + } + + // optional uint64 postscriptLength = 4; + private long postscriptLength_ ; + /** + * optional uint64 postscriptLength = 4; + */ + public boolean hasPostscriptLength() { + return ((bitField0_ & 0x00000008) == 0x00000008); + } + /** + * optional uint64 postscriptLength = 4; + */ + public long getPostscriptLength() { + return postscriptLength_; + } + /** + * optional uint64 postscriptLength = 4; + */ + public Builder setPostscriptLength(long value) { + bitField0_ |= 0x00000008; + postscriptLength_ = value; + onChanged(); + return this; + } + /** + * optional uint64 postscriptLength = 4; + */ + public Builder clearPostscriptLength() { + bitField0_ = (bitField0_ & ~0x00000008); + postscriptLength_ = 0L; + onChanged(); + return this; + } + + // @@protoc_insertion_point(builder_scope:orc.proto.FileTail) + } + + static { + defaultInstance = new FileTail(true); + defaultInstance.initFields(); + } + + // @@protoc_insertion_point(class_scope:orc.proto.FileTail) + } + private static com.google.protobuf.Descriptors.Descriptor internal_static_orc_proto_IntegerStatistics_descriptor; private static @@ -19055,6 +19924,11 @@ public Builder setMagicBytes( private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_orc_proto_PostScript_fieldAccessorTable; + private static com.google.protobuf.Descriptors.Descriptor + internal_static_orc_proto_FileTail_descriptor; + private static + com.google.protobuf.GeneratedMessage.FieldAccessorTable + internal_static_orc_proto_FileTail_fieldAccessorTable; public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() { @@ -19135,9 +20009,13 @@ public Builder setMagicBytes( "ression\030\002 \001(\0162\032.orc.proto.CompressionKin" + "d\022\034\n\024compressionBlockSize\030\003 \001(\004\022\023\n\007versi", "on\030\004 \003(\rB\002\020\001\022\026\n\016metadataLength\030\005 \001(\004\022\025\n\r" + - "writerVersion\030\006 \001(\r\022\016\n\005magic\030\300> \001(\t*:\n\017C" + - 
"ompressionKind\022\010\n\004NONE\020\000\022\010\n\004ZLIB\020\001\022\n\n\006SN" + - "APPY\020\002\022\007\n\003LZO\020\003B\020\n\016org.apache.orc" + "writerVersion\030\006 \001(\r\022\016\n\005magic\030\300> \001(\t\"\206\001\n\010" + + "FileTail\022)\n\npostscript\030\001 \001(\0132\025.orc.proto" + + ".PostScript\022!\n\006footer\030\002 \001(\0132\021.orc.proto." + + "Footer\022\022\n\nfileLength\030\003 \001(\004\022\030\n\020postscript" + + "Length\030\004 \001(\004*:\n\017CompressionKind\022\010\n\004NONE\020" + + "\000\022\010\n\004ZLIB\020\001\022\n\n\006SNAPPY\020\002\022\007\n\003LZO\020\003B\020\n\016org." + + "apache.orc" }; com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { @@ -19282,6 +20160,12 @@ public Builder setMagicBytes( com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_orc_proto_PostScript_descriptor, new java.lang.String[] { "FooterLength", "Compression", "CompressionBlockSize", "Version", "MetadataLength", "WriterVersion", "Magic", }); + internal_static_orc_proto_FileTail_descriptor = + getDescriptor().getMessageTypes().get(23); + internal_static_orc_proto_FileTail_fieldAccessorTable = new + com.google.protobuf.GeneratedMessage.FieldAccessorTable( + internal_static_orc_proto_FileTail_descriptor, + new java.lang.String[] { "Postscript", "Footer", "FileLength", "PostscriptLength", }); return null; } }; diff --git a/orc/src/java/org/apache/orc/FileMetaInfo.java b/orc/src/java/org/apache/orc/FileMetaInfo.java deleted file mode 100644 index d3cac3b..0000000 --- a/orc/src/java/org/apache/orc/FileMetaInfo.java +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.orc; - -import java.nio.ByteBuffer; -import java.util.List; - -/** - * FileMetaInfo - represents file metadata stored in footer and postscript sections of the file - * that is useful for Reader implementation - * - */ -public class FileMetaInfo { - public ByteBuffer footerMetaAndPsBuffer; - public final String compressionType; - public final int bufferSize; - public final int metadataSize; - public final ByteBuffer footerBuffer; - public final List versionList; - public final OrcFile.WriterVersion writerVersion; - - - /** Ctor used when reading splits - no version list or full footer buffer. */ - public FileMetaInfo(String compressionType, int bufferSize, int metadataSize, - ByteBuffer footerBuffer, OrcFile.WriterVersion writerVersion) { - this(compressionType, bufferSize, metadataSize, footerBuffer, null, - writerVersion, null); - } - - /** Ctor used when creating file info during init and when getting a new one. 
*/ - public FileMetaInfo(String compressionType, int bufferSize, int metadataSize, - ByteBuffer footerBuffer, List versionList, - OrcFile.WriterVersion writerVersion, - ByteBuffer fullFooterBuffer) { - this.compressionType = compressionType; - this.bufferSize = bufferSize; - this.metadataSize = metadataSize; - this.footerBuffer = footerBuffer; - this.versionList = versionList; - this.writerVersion = writerVersion; - this.footerMetaAndPsBuffer = fullFooterBuffer; - } - - public OrcFile.WriterVersion getWriterVersion() { - return writerVersion; - } - -} \ No newline at end of file diff --git a/orc/src/java/org/apache/orc/FileTail.java b/orc/src/java/org/apache/orc/FileTail.java new file mode 100644 index 0000000..afd419e --- /dev/null +++ b/orc/src/java/org/apache/orc/FileTail.java @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.hadoop.hive.common.io.DiskRange; +import org.apache.orc.impl.BufferChunk; +import org.apache.orc.impl.InStream; +import org.apache.orc.impl.WriterImpl; + +import com.google.common.collect.Lists; + +/** + * FileMetaInfo - represents file metadata stored in footer and postscript sections of the file + * that is useful for Reader implementation + * + */ +public class FileTail { + private OrcProto.FileTail fileTailProto; + private long compressionBufferSize; + private OrcProto.CompressionKind compressionKind; + + FileTail(OrcProto.FileTail fileTail) { + this.fileTailProto = fileTail; + this.compressionBufferSize = fileTail.getPostscript().getCompressionBlockSize(); + this.compressionKind = fileTail.getPostscript().getCompression(); + } + + FileTail(ByteBuffer buffer) throws IOException { + this.fileTailProto = OrcProto.FileTail.parseFrom(InStream.createCodedInputStream("filetail", + Lists.newArrayList(new BufferChunk(buffer, 0)), buffer.limit(), + WriterImpl.createCodec(CompressionKind.valueOf(compressionKind.name())), + (int) compressionBufferSize)); + } + + public long getCompressionBufferSize() { + return compressionBufferSize; + } +} \ No newline at end of file diff --git a/orc/src/java/org/apache/orc/OrcFile.java b/orc/src/java/org/apache/orc/OrcFile.java index 7dd7333..0a4615c 100644 --- a/orc/src/java/org/apache/orc/OrcFile.java +++ b/orc/src/java/org/apache/orc/OrcFile.java @@ -25,6 +25,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.orc.impl.MemoryManager; +import org.apache.orc.impl.OrcTail; import org.apache.orc.impl.ReaderImpl; import org.apache.orc.impl.WriterImpl; @@ -160,19 +161,14 @@ protected OrcFile() {} public static class ReaderOptions { private final Configuration conf; private FileSystem filesystem; - private FileMetaInfo 
fileMetaInfo; // TODO: this comes from some place. private long maxLength = Long.MAX_VALUE; - private FileMetadata fullFileMetadata; // Propagate from LLAP cache. + private OrcTail orcTail; + private FileMetadata fileMetadata; public ReaderOptions(Configuration conf) { this.conf = conf; } - public ReaderOptions fileMetaInfo(FileMetaInfo info) { - fileMetaInfo = info; - return this; - } - public ReaderOptions filesystem(FileSystem fs) { this.filesystem = fs; return this; @@ -183,8 +179,8 @@ public ReaderOptions maxLength(long val) { return this; } - public ReaderOptions fileMetadata(FileMetadata metadata) { - this.fullFileMetadata = metadata; + public ReaderOptions orcTail(OrcTail tail) { + this.orcTail = tail; return this; } @@ -196,16 +192,21 @@ public FileSystem getFilesystem() { return filesystem; } - public FileMetaInfo getFileMetaInfo() { - return fileMetaInfo; - } - public long getMaxLength() { return maxLength; } + public OrcTail getOrcTail() { + return orcTail; + } + + public ReaderOptions fileMetadata(final FileMetadata metadata) { + fileMetadata = metadata; + return this; + } + public FileMetadata getFileMetadata() { - return fullFileMetadata; + return fileMetadata; } } diff --git a/orc/src/java/org/apache/orc/OrcUtils.java b/orc/src/java/org/apache/orc/OrcUtils.java index 5845ba6..dc83b9c 100644 --- a/orc/src/java/org/apache/orc/OrcUtils.java +++ b/orc/src/java/org/apache/orc/OrcUtils.java @@ -21,6 +21,8 @@ import java.util.Arrays; import java.util.List; +import org.apache.orc.impl.ReaderImpl; + import com.google.common.collect.Lists; public class OrcUtils { @@ -527,4 +529,13 @@ TypeDescription convertTypeFromProtobuf(List types, } throw new IllegalArgumentException("Unknown ORC type " + type.getKind()); } + + public static List convertProtoStripesToStripes( + List stripes) { + List result = new ArrayList(stripes.size()); + for (OrcProto.StripeInformation info : stripes) { + result.add(new ReaderImpl.StripeInformationImpl(info)); + } + return result; + } } diff --git a/orc/src/java/org/apache/orc/Reader.java b/orc/src/java/org/apache/orc/Reader.java index 87f3293..c2d5235 100644 --- a/orc/src/java/org/apache/orc/Reader.java +++ b/orc/src/java/org/apache/orc/Reader.java @@ -138,6 +138,13 @@ OrcFile.WriterVersion getWriterVersion(); /** + * Get the file tail (footer + postscript) + * + * @return - file tail + */ + OrcProto.FileTail getFileTail(); + + /** * Options for creating a RecordReader. */ public static class Options { @@ -354,7 +361,7 @@ public String toString() { /** * @return Stripe statistics. */ - List getStripeStatistics(); + List getStripeStatistics() throws IOException; /** * @return File statistics, in original protobuf form. diff --git a/orc/src/java/org/apache/orc/impl/OrcTail.java b/orc/src/java/org/apache/orc/impl/OrcTail.java new file mode 100644 index 0000000..c7187f3 --- /dev/null +++ b/orc/src/java/org/apache/orc/impl/OrcTail.java @@ -0,0 +1,150 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import static org.apache.orc.impl.ReaderImpl.extractMetadata; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +import org.apache.orc.CompressionCodec; +import org.apache.orc.CompressionKind; +import org.apache.orc.OrcFile; +import org.apache.orc.OrcProto; +import org.apache.orc.OrcUtils; +import org.apache.orc.StripeInformation; +import org.apache.orc.StripeStatistics; +import org.apache.orc.TypeDescription; + +public final class OrcTail { + // postscript + footer - Serialized in OrcSplit + private final OrcProto.FileTail fileTail; + // serialized representation of metadata, footer and postscript + private final ByteBuffer serializedTail; + // used to invalidate cache entries + private final long fileModificationTime; + private OrcProto.Metadata metadata; + + public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail) { + this(fileTail, serializedTail, -1); + } + + public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail, long fileModificationTime) { + this.fileTail = fileTail; + this.serializedTail = serializedTail; + this.fileModificationTime = fileModificationTime; + this.metadata = null; // lazily deserialized + } + + public ByteBuffer getSerializedTail() { + return serializedTail; + } + + public long getFileModificationTime() { + return fileModificationTime; + } + + public OrcProto.Footer getFooter() { + return fileTail.getFooter(); + } + + public OrcProto.PostScript getPostScript() { + return fileTail.getPostscript(); + } + + public int getPostScriptLength() { + return (int) fileTail.getPostscriptLength(); + } + + public long getFileLength() { + return fileTail.getFileLength(); + } + + public OrcFile.WriterVersion getWriterVersion() { + OrcProto.PostScript ps = fileTail.getPostscript(); + return (ps.hasWriterVersion() + ? 
OrcFile.WriterVersion.from(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL); + } + + public List getStripes() { + List result = new ArrayList<>(); + for (OrcProto.StripeInformation stripeProto : fileTail.getFooter().getStripesList()) { + result.add(new ReaderImpl.StripeInformationImpl(stripeProto)); + } + return result; + } + + public CompressionKind getCompressionKind() { + return CompressionKind.valueOf(fileTail.getPostscript().getCompression().name()); + } + + public CompressionCodec getCompressionCodec() { + return WriterImpl.createCodec(getCompressionKind()); + } + + public int getCompressionBufferSize() { + return (int) fileTail.getPostscript().getCompressionBlockSize(); + } + + public List getStripeStatistics() throws IOException { + List result = new ArrayList<>(); + List ssProto = getStripeStatisticsProto(); + if (ssProto != null) { + for (OrcProto.StripeStatistics ss : ssProto) { + result.add(new StripeStatistics(ss.getColStatsList())); + } + } + return result; + } + + public List getStripeStatisticsProto() throws IOException { + if (serializedTail == null) return null; + if (metadata == null) { + metadata = extractMetadata(serializedTail, 0, + (int) fileTail.getPostscript().getMetadataLength(), + getCompressionCodec(), getCompressionBufferSize()); + } + return metadata.getStripeStatsList(); + } + + public int getMetadataSize() { + return (int) getPostScript().getMetadataLength(); + } + + public List getTypes() { + return getFooter().getTypesList(); + } + + public TypeDescription getTypeDescription() { + return OrcUtils.convertTypeFromProtobuf(getFooter().getTypesList(), 0); + } + + public OrcProto.FileTail getFileTail() { + return fileTail; + } + + public OrcProto.FileTail getMinimalFileTail() { + OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder(fileTail); + OrcProto.Footer.Builder footerBuilder = OrcProto.Footer.newBuilder(fileTail.getFooter()); + footerBuilder.clearStatistics(); + fileTailBuilder.setFooter(footerBuilder.build()); + OrcProto.FileTail result = fileTailBuilder.build(); + return result; + } +} diff --git a/orc/src/java/org/apache/orc/impl/ReaderImpl.java b/orc/src/java/org/apache/orc/impl/ReaderImpl.java index 1dd5e43..ae67e1d 100644 --- a/orc/src/java/org/apache/orc/impl/ReaderImpl.java +++ b/orc/src/java/org/apache/orc/impl/ReaderImpl.java @@ -27,16 +27,16 @@ import java.util.List; import java.util.Set; +import org.apache.hadoop.fs.FileStatus; +import org.apache.orc.CompressionKind; +import org.apache.orc.FileMetadata; import org.apache.orc.OrcFile; -import org.apache.orc.OrcUtils; import org.apache.orc.Reader; import org.apache.orc.RecordReader; import org.apache.orc.TypeDescription; import org.apache.orc.ColumnStatistics; import org.apache.orc.CompressionCodec; import org.apache.orc.FileFormatException; -import org.apache.orc.FileMetaInfo; -import org.apache.orc.FileMetadata; import org.apache.orc.StripeInformation; import org.apache.orc.StripeStatistics; import org.slf4j.Logger; @@ -63,27 +63,25 @@ private final long maxLength; protected final Path path; protected final org.apache.orc.CompressionKind compressionKind; - protected final CompressionCodec codec; - protected final int bufferSize; - private final List stripeStats; + protected CompressionCodec codec; + protected int bufferSize; + protected OrcProto.Metadata metadata; + private List stripeStats; private final int metadataSize; protected final List types; - private final TypeDescription schema; + private TypeDescription schema; private final List userMetadata; private 
final List fileStats; private final List stripes; protected final int rowIndexStride; private final long contentLength, numberOfRows; - private long deserializedSize = -1; protected final Configuration conf; private final List versionList; private final OrcFile.WriterVersion writerVersion; - // Same for metastore cache - maintains the same background buffer, but includes postscript. - // This will only be set if the file footer/metadata was read from disk. - private final ByteBuffer footerMetaAndPsBuffer; + protected OrcTail tail; public static class StripeInformationImpl implements StripeInformation { @@ -207,6 +205,11 @@ public long getContentLength() { } @Override + public OrcProto.FileTail getFileTail() { + return tail.getFileTail(); + } + + @Override public int getRowIndexStride() { return rowIndexStride; } @@ -261,6 +264,32 @@ protected static void ensureOrcFooter(FSDataInputStream in, } /** + * Ensure this is an ORC file to prevent users from trying to read text + * files or RC files as ORC files. + * @param psLen the postscript length + * @param buffer the tail of the file + * @throws IOException + */ + protected static void ensureOrcFooter(ByteBuffer buffer, int psLen) throws IOException { + int magicLength = OrcFile.MAGIC.length(); + int fullLength = magicLength + 1; + if (psLen < fullLength || buffer.remaining() < fullLength) { + throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen); + } + + int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength; + byte[] array = buffer.array(); + // now look for the magic string at the end of the postscript. + if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) { + // if it isn't there, this may be 0.11.0 version of the ORC file. + // Read the first 3 bytes from the buffer to check for the header + if (!Text.decode(buffer.array(), 0, magicLength).equals(OrcFile.MAGIC)) { + throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen); + } + } + } + + /** * Build a version string out of an array. 
* @param version the version number as a list * @return the human readable form of the version string @@ -312,11 +341,15 @@ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { if (fs == null) { fs = path.getFileSystem(options.getConfiguration()); } + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("hdfs")) { + LOG.info("HDFS FileSystem counters before ORC reader creation: {}", statistics.toString()); + } + } this.fileSystem = fs; this.path = path; this.conf = options.getConfiguration(); this.maxLength = options.getMaxLength(); - FileMetadata fileMetadata = options.getFileMetadata(); if (fileMetadata != null) { this.compressionKind = fileMetadata.getCompressionKind(); @@ -334,40 +367,30 @@ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { this.fileStats = fileMetadata.getFileStats(); this.stripes = fileMetadata.getStripes(); this.userMetadata = null; // not cached and not needed here - this.footerMetaAndPsBuffer = null; } else { - FileMetaInfo footerMetaData; - if (options.getFileMetaInfo() != null) { - footerMetaData = options.getFileMetaInfo(); - this.footerMetaAndPsBuffer = null; + OrcTail orcTail = options.getOrcTail(); + if (orcTail == null) { + tail = extractFileTail(fs, path, options.getMaxLength()); + options.orcTail(tail); } else { - footerMetaData = extractMetaInfoFromFooter(fs, path, - options.getMaxLength()); - this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer; + tail = orcTail; } - options.fileMetaInfo(footerMetaData); - MetaInfoObjExtractor rInfo = - new MetaInfoObjExtractor(footerMetaData.compressionType, - footerMetaData.bufferSize, - footerMetaData.metadataSize, - footerMetaData.footerBuffer - ); - this.compressionKind = rInfo.compressionKind; - this.codec = rInfo.codec; - this.bufferSize = rInfo.bufferSize; - this.metadataSize = rInfo.metadataSize; - this.stripeStats = rInfo.metadata.getStripeStatsList(); - this.types = rInfo.footer.getTypesList(); - this.rowIndexStride = rInfo.footer.getRowIndexStride(); - this.contentLength = rInfo.footer.getContentLength(); - this.numberOfRows = rInfo.footer.getNumberOfRows(); - this.userMetadata = rInfo.footer.getMetadataList(); - this.fileStats = rInfo.footer.getStatisticsList(); - this.versionList = footerMetaData.versionList; - this.writerVersion = footerMetaData.writerVersion; - this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList()); - } - this.schema = OrcUtils.convertTypeFromProtobuf(this.types, 0); + this.compressionKind = tail.getCompressionKind(); + this.codec = tail.getCompressionCodec(); + this.bufferSize = tail.getCompressionBufferSize(); + this.metadataSize = tail.getMetadataSize(); + this.versionList = tail.getPostScript().getVersionList(); + this.types = tail.getFooter().getTypesList(); + this.rowIndexStride = tail.getFooter().getRowIndexStride(); + this.contentLength = tail.getFooter().getContentLength(); + this.numberOfRows = tail.getFooter().getNumberOfRows(); + this.userMetadata = tail.getFooter().getMetadataList(); + this.fileStats = tail.getFooter().getStatisticsList(); + this.writerVersion = tail.getWriterVersion(); + this.stripes = tail.getStripes(); + this.schema = tail.getTypeDescription(); + this.stripeStats = tail.getStripeStatisticsProto(); + } } /** @@ -392,7 +415,7 @@ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize)); } - private static 
OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos, + public static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos, int metadataSize, CompressionCodec codec, int bufferSize) throws IOException { bb.position(metadataAbsPos); bb.limit(metadataAbsPos + metadataSize); @@ -425,22 +448,47 @@ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { return ps; } - private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, - Path path, - long maxFileLength - ) throws IOException { + public static OrcTail extractFileTail(ByteBuffer buffer, long fileLength) throws IOException { + int readSize = buffer.limit(); + int psLen = buffer.get(readSize - 1) & 0xff; + int psOffset = readSize - 1 - psLen; + ensureOrcFooter(buffer, psLen); + byte[] psBuffer = new byte[psLen]; + System.arraycopy(buffer.array(), psOffset, psBuffer, 0, psLen); + OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(psBuffer); + int footerSize = (int) ps.getFooterLength(); + CompressionCodec codec = WriterImpl + .createCodec(CompressionKind.valueOf(ps.getCompression().name())); + OrcProto.Footer footer = extractFooter(buffer, + (int) (buffer.position() + ps.getMetadataLength()), + footerSize, codec, (int) ps.getCompressionBlockSize()); + OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder() + .setPostscriptLength(psLen) + .setPostscript(ps) + .setFooter(footer) + .setFileLength(fileLength); + return new OrcTail(fileTailBuilder.build(), buffer.slice()); + } + + protected OrcTail extractFileTail(FileSystem fs, Path path, + long maxFileLength) throws IOException { FSDataInputStream file = fs.open(path); - ByteBuffer buffer = null, fullFooterBuffer = null; - OrcProto.PostScript ps = null; - OrcFile.WriterVersion writerVersion = null; + ByteBuffer buffer; + OrcProto.PostScript ps; + OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder(); + long modificationTime; try { // figure out the size of the file using the option or filesystem long size; if (maxFileLength == Long.MAX_VALUE) { - size = fs.getFileStatus(path).getLen(); + FileStatus fileStatus = fs.getFileStatus(path); + size = fileStatus.getLen(); + modificationTime = fileStatus.getModificationTime(); } else { size = maxFileLength; + modificationTime = -1; } + fileTailBuilder.setFileLength(size); //read last bytes into buffer to get PostScript int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS); @@ -456,10 +504,12 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, ensureOrcFooter(file, path, psLen, buffer); int psOffset = readSize - 1 - psLen; ps = extractPostScript(buffer, path, psLen, psOffset); + bufferSize = (int) ps.getCompressionBlockSize(); + codec = WriterImpl.createCodec(CompressionKind.valueOf(ps.getCompression().name())); + fileTailBuilder.setPostscriptLength(psLen).setPostscript(ps); int footerSize = (int) ps.getFooterLength(); int metadataSize = (int) ps.getMetadataLength(); - writerVersion = extractWriterVersion(ps); //check if extra bytes need to be read int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize); @@ -473,17 +523,23 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, extraBuf.put(buffer); buffer = extraBuf; buffer.position(0); - fullFooterBuffer = buffer.slice(); buffer.limit(footerSize + metadataSize); + readSize += extra; + psOffset = readSize - 1 - psLen; } else { //footer is already in the bytes in buffer, just adjust position, length buffer.position(psOffset - footerSize - 
metadataSize); - fullFooterBuffer = buffer.slice(); buffer.limit(psOffset); } - // remember position for later TODO: what later? this comment is useless buffer.mark(); + int footerOffset = psOffset - footerSize; + buffer.position(footerOffset); + ByteBuffer footerBuffer = buffer.slice(); + buffer.reset(); + OrcProto.Footer footer = extractFooter(footerBuffer, 0, footerSize, + codec, bufferSize); + fileTailBuilder.setFooter(footer); } finally { try { file.close(); @@ -492,68 +548,12 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, } } - return new FileMetaInfo( - ps.getCompression().toString(), - (int) ps.getCompressionBlockSize(), - (int) ps.getMetadataLength(), - buffer, - ps.getVersionList(), - writerVersion, - fullFooterBuffer - ); - } - - protected static OrcFile.WriterVersion extractWriterVersion(OrcProto.PostScript ps) { - return (ps.hasWriterVersion() - ? getWriterVersion(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL); - } - - protected static List convertProtoStripesToStripes( - List stripes) { - List result = new ArrayList(stripes.size()); - for (OrcProto.StripeInformation info : stripes) { - result.add(new StripeInformationImpl(info)); - } - return result; - } - - /** - * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl - * from serialized fields. - * As the fields are final, the fields need to be initialized in the constructor and - * can't be done in some helper function. So this helper class is used instead. - * - */ - private static class MetaInfoObjExtractor{ - final org.apache.orc.CompressionKind compressionKind; - final CompressionCodec codec; - final int bufferSize; - final int metadataSize; - final OrcProto.Metadata metadata; - final OrcProto.Footer footer; - - MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize, - ByteBuffer footerBuffer) throws IOException { - - this.compressionKind = org.apache.orc.CompressionKind.valueOf(codecStr.toUpperCase()); - this.bufferSize = bufferSize; - this.codec = WriterImpl.createCodec(compressionKind); - this.metadataSize = metadataSize; - - int position = footerBuffer.position(); - int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize; - - this.metadata = extractMetadata(footerBuffer, position, metadataSize, codec, bufferSize); - this.footer = extractFooter( - footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize); - - footerBuffer.position(position); - } + return new OrcTail(fileTailBuilder.build(), buffer.slice(), modificationTime); } @Override public ByteBuffer getSerializedFileFooter() { - return footerMetaAndPsBuffer; + return tail.getSerializedTail(); } @Override @@ -722,7 +722,11 @@ private int getLastIdx() { } @Override - public List getStripeStatistics() { + public List getStripeStatistics() throws IOException { + if (stripeStats == null && metadata == null) { + metadata = extractMetadata(tail.getSerializedTail(), 0, metadataSize, codec, bufferSize); + stripeStats = metadata.getStripeStatsList(); + } List result = new ArrayList<>(); for (OrcProto.StripeStatistics ss : stripeStats) { result.add(new StripeStatistics(ss.getColStatsList())); diff --git a/orc/src/protobuf/orc_proto.proto b/orc/src/protobuf/orc_proto.proto index f4935b4..3b2cf1b 100644 --- a/orc/src/protobuf/orc_proto.proto +++ b/orc/src/protobuf/orc_proto.proto @@ -220,3 +220,11 @@ message PostScript { // Leave this last in the record optional string magic = 8000; } + +// The contents of the file tail that must be serialized. 
+message FileTail { + optional PostScript postscript = 1; + optional Footer footer = 2; + optional uint64 fileLength = 3; + optional uint64 postscriptLength = 4; +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ExternalCache.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ExternalCache.java index 6556fbf..2fbe1d4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ExternalCache.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ExternalCache.java @@ -28,7 +28,6 @@ import java.util.Map; import java.util.Map.Entry; -import org.apache.commons.codec.binary.Hex; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.hive.conf.HiveConf; @@ -36,7 +35,6 @@ import org.apache.hadoop.hive.metastore.api.MetadataPpdResult; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; import org.apache.hadoop.hive.ql.io.HdfsUtils; -import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.FileInfo; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.FooterCache; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; @@ -44,7 +42,7 @@ import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId; -import org.apache.orc.FileMetaInfo; +import org.apache.orc.impl.OrcTail; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -74,13 +72,12 @@ public ExternalCache(LocalCache lc, ExternalFooterCachesByConf efcf) { } @Override - public void put(Long fileId, FileStatus file, FileMetaInfo fileMetaInfo, Reader orcReader) - throws IOException { - localCache.put(fileId, file, fileMetaInfo, orcReader); - if (fileId != null) { + public void put(OrcInputFormat.FooterCacheKey key, OrcTail orcTail) throws IOException { + localCache.put(key.getPath(), orcTail); + if (key.getFileId() != null) { try { - externalCacheSrc.getCache(conf).putFileMetadata(Lists.newArrayList(fileId), - Lists.newArrayList(((ReaderImpl)orcReader).getSerializedFileFooter())); + externalCacheSrc.getCache(conf).putFileMetadata(Lists.newArrayList(key.getFileId()), + Lists.newArrayList(orcTail.getSerializedTail())); } catch (HiveException e) { throw new IOException(e); } @@ -108,7 +105,7 @@ public void configure(HiveConf queryConfig) { @Override public void getAndValidate(List files, boolean isOriginal, - FileInfo[] result, ByteBuffer[] ppdResult) throws IOException, HiveException { + OrcTail[] result, ByteBuffer[] ppdResult) throws IOException, HiveException { assert result.length == files.size(); assert ppdResult == null || ppdResult.length == files.size(); // First, check the local cache. 
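(Editor's note, not part of the patch: a minimal sketch of how the new `FileTail` message added to `orc_proto.proto` above, and its generated `OrcProto.FileTail` counterpart from `OrcProto.java`, can be built, serialized, and parsed back. The builder and parser methods used here appear in the generated code in this diff; the literal length values and the class/method names of the sketch itself are made up for illustration.)

```java
import org.apache.orc.OrcProto;

public class FileTailRoundTrip {
  public static void main(String[] args) throws Exception {
    // Build a FileTail message; all four fields are optional in the proto,
    // so only the ones that are set will be marked as present.
    OrcProto.FileTail tail = OrcProto.FileTail.newBuilder()
        .setPostscript(OrcProto.PostScript.getDefaultInstance())
        .setFooter(OrcProto.Footer.getDefaultInstance())
        .setFileLength(1024L)        // hypothetical total file length
        .setPostscriptLength(23L)    // hypothetical postscript length
        .build();

    // Serialize to bytes and parse back with the generated parser.
    byte[] bytes = tail.toByteArray();
    OrcProto.FileTail parsed = OrcProto.FileTail.parseFrom(bytes);

    // Presence checks mirror the has*/get* accessors in the generated class.
    if (parsed.hasFileLength()) {
      System.out.println("fileLength = " + parsed.getFileLength());
    }
  }
}
```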
@@ -155,7 +152,7 @@ public void getAndValidate(List files, boolean isOriginal, } private int getAndVerifyIndex(HashMap posMap, - List files, FileInfo[] result, Long fileId) { + List files, OrcTail[] result, Long fileId) { int ix = posMap.get(fileId); assert result[ix] == null; assert fileId != null && fileId.equals(files.get(ix).getFileId()); @@ -163,9 +160,9 @@ private int getAndVerifyIndex(HashMap posMap, } private boolean processBbResult( - ByteBuffer bb, int ix, HdfsFileStatusWithId file, FileInfo[] result) throws IOException { + ByteBuffer bb, int ix, HdfsFileStatusWithId file, OrcTail[] result) throws IOException { if (bb == null) return true; - result[ix] = createFileInfoFromMs(file, bb); + result[ix] = createOrcTailFromMs(file, bb); if (result[ix] == null) { return false; } @@ -175,12 +172,12 @@ private boolean processBbResult( } private void processPpdResult(MetadataPpdResult mpr, HdfsFileStatusWithId file, - int ix, FileInfo[] result, ByteBuffer[] ppdResult) throws IOException { + int ix, OrcTail[] result, ByteBuffer[] ppdResult) throws IOException { if (mpr == null) return; // This file is unknown to metastore. ppdResult[ix] = mpr.isSetIncludeBitset() ? mpr.bufferForIncludeBitset() : NO_SPLIT_AFTER_PPD; if (mpr.isSetMetadata()) { - result[ix] = createFileInfoFromMs(file, mpr.bufferForMetadata()); + result[ix] = createOrcTailFromMs(file, mpr.bufferForMetadata()); if (result[ix] != null) { localCache.put(file.getFileStatus().getPath(), result[ix]); } @@ -188,7 +185,7 @@ private void processPpdResult(MetadataPpdResult mpr, HdfsFileStatusWithId file, } private List determineFileIdsToQuery( - List files, FileInfo[] result, HashMap posMap) { + List files, OrcTail[] result, HashMap posMap) { for (int i = 0; i < result.length; ++i) { if (result[i] != null) continue; HdfsFileStatusWithId file = files.get(i); @@ -293,25 +290,12 @@ public static void translateSargToTableColIndexes( } } - private static FileInfo createFileInfoFromMs( + private static OrcTail createOrcTailFromMs( HdfsFileStatusWithId file, ByteBuffer bb) throws IOException { if (bb == null) return null; FileStatus fs = file.getFileStatus(); - ReaderImpl.FooterInfo fi = null; ByteBuffer copy = bb.duplicate(); - try { - fi = ReaderImpl.extractMetaInfoFromFooter(copy, fs.getPath()); - } catch (Exception ex) { - byte[] data = new byte[bb.remaining()]; - System.arraycopy(bb.array(), bb.arrayOffset() + bb.position(), data, 0, data.length); - String msg = "Failed to parse the footer stored in cache for file ID " - + file.getFileId() + " " + bb + " [ " + Hex.encodeHexString(data) + " ]"; - LOG.error(msg, ex); - return null; - } - return new FileInfo(fs.getModificationTime(), fs.getLen(), fi.getStripes(), fi.getMetadata(), - fi.getFooter().getTypesList(), fi.getFooter().getStatisticsList(), fi.getFileMetaInfo(), - fi.getFileMetaInfo().writerVersion, file.getFileId()); + return new OrcTail(null, copy, fs.getModificationTime()); } private static final class Baos extends ByteArrayOutputStream { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/LocalCache.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/LocalCache.java index 8151e52..fe93656 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/LocalCache.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/LocalCache.java @@ -24,29 +24,23 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.FileInfo; -import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.FooterCache; -import 
org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId; -import org.apache.orc.FileMetaInfo; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.shims.HadoopShims; +import org.apache.orc.impl.OrcTail; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; -/** Local footer cache using Guava. Stores convoluted Java objects. */ -class LocalCache implements FooterCache { +class LocalCache implements OrcInputFormat.FooterCache { private static final Logger LOG = LoggerFactory.getLogger(LocalCache.class); - private static boolean isDebugEnabled = LOG.isDebugEnabled(); + private final Cache cache; - private final Cache cache; - - public LocalCache(int numThreads, int cacheStripeDetailsSize) { + LocalCache(int numThreads, int cacheStripeDetailsSize) { cache = CacheBuilder.newBuilder() .concurrencyLevel(numThreads) - .initialCapacity(cacheStripeDetailsSize) .maximumSize(cacheStripeDetailsSize) - .softValues() .build(); } @@ -55,49 +49,50 @@ public void clear() { cache.cleanUp(); } - public void getAndValidate(List files, boolean isOriginal, - FileInfo[] result, ByteBuffer[] ppdResult) throws IOException { + public void put(Path path, OrcTail tail) { + cache.put(path, tail); + } + + public OrcTail get(Path path) { + return cache.getIfPresent(path); + } + + @Override + public void getAndValidate(final List files, + final boolean isOriginal, + final OrcTail[] result, final ByteBuffer[] ppdResult) + throws IOException, HiveException { // TODO: should local cache also be by fileId? Preserve the original logic for now. assert result.length == files.size(); int i = -1; - for (HdfsFileStatusWithId fileWithId : files) { + for (HadoopShims.HdfsFileStatusWithId fileWithId : files) { ++i; FileStatus file = fileWithId.getFileStatus(); Path path = file.getPath(); - Long fileId = fileWithId.getFileId(); - FileInfo fileInfo = cache.getIfPresent(path); - if (isDebugEnabled) { - LOG.debug("Info " + (fileInfo == null ? "not " : "") + "cached for path: " + path); + OrcTail tail = cache.getIfPresent(path); + if (LOG.isDebugEnabled()) { + LOG.debug("Serialized tail " + (tail == null ? "not " : "") + "cached for path: " + path); } - if (fileInfo == null) continue; - if ((fileId != null && fileInfo.fileId != null && fileId == fileInfo.fileId) - || (fileInfo.modificationTime == file.getModificationTime() && - fileInfo.size == file.getLen())) { - result[i] = fileInfo; + if (tail == null) continue; + if (tail != null && file.getLen() == tail.getFileTail().getFileLength() + && file.getModificationTime() == tail.getFileModificationTime()) { + result[i] = tail; continue; } // Invalidate cache.invalidate(path); - if (isDebugEnabled) { + if (LOG.isDebugEnabled()) { LOG.debug("Meta-Info for : " + path + " changed. 
CachedModificationTime: " - + fileInfo.modificationTime + ", CurrentModificationTime: " - + file.getModificationTime() + ", CachedLength: " + fileInfo.size + + tail.getFileModificationTime() + ", CurrentModificationTime: " + + file.getModificationTime() + ", CachedLength: " + tail.getFileTail().getFileLength() + ", CurrentLength: " + file.getLen()); } } } - public void put(Path path, FileInfo fileInfo) { - cache.put(path, fileInfo); - } - @Override - public void put(Long fileId, FileStatus file, FileMetaInfo fileMetaInfo, Reader orcReader) - throws IOException { - cache.put(file.getPath(), new FileInfo(file.getModificationTime(), file.getLen(), - orcReader.getStripes(), orcReader.getStripeStatistics(), orcReader.getTypes(), - orcReader.getOrcProtoFileStatistics(), fileMetaInfo, orcReader.getWriterVersion(), - fileId)); + public boolean hasPpd() { + return false; } @Override @@ -106,7 +101,8 @@ public boolean isBlocking() { } @Override - public boolean hasPpd() { - return false; + public void put(final OrcInputFormat.FooterCacheKey cacheKey, final OrcTail orcTail) + throws IOException { + put(cacheKey.getPath(), orcTail); } } \ No newline at end of file diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java index 58e5da1..5366020 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java @@ -30,6 +30,7 @@ import org.apache.orc.FileMetadata; import org.apache.orc.impl.MemoryManager; import org.apache.orc.TypeDescription; +import org.apache.orc.impl.OrcTail; /** * Contains factory methods to read or write ORC files. @@ -72,6 +73,11 @@ public ReaderOptions fileMetadata(FileMetadata metadata) { super.fileMetadata(metadata); return this; } + + public ReaderOptions orcTail(OrcTail orcTail) { + super.orcTail(orcTail); + return this; + } } public static ReaderOptions readerOptions(Configuration conf) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileFormatProxy.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileFormatProxy.java index c9c7b5a..52b60db 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileFormatProxy.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileFormatProxy.java @@ -29,6 +29,8 @@ import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.orc.OrcProto; import org.apache.orc.StripeInformation; +import org.apache.orc.StripeStatistics; +import org.apache.orc.impl.OrcTail; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,15 +43,16 @@ public SplitInfos applySargToMetadata( SearchArgument sarg, ByteBuffer fileMetadata) throws IOException { // TODO: ideally we should store shortened representation of only the necessary fields // in HBase; it will probably require custom SARG application code. - ReaderImpl.FooterInfo fi = ReaderImpl.extractMetaInfoFromFooter(fileMetadata, null); - OrcProto.Footer footer = fi.getFooter(); + // FIXME: how do we get file length here? 
+ OrcTail orcTail = ReaderImpl.extractFileTail(fileMetadata, -1); + OrcProto.Footer footer = orcTail.getFooter(); int stripeCount = footer.getStripesCount(); boolean[] result = OrcInputFormat.pickStripesViaTranslatedSarg( - sarg, fi.getFileMetaInfo().getWriterVersion(), - footer.getTypesList(), fi.getMetadata(), stripeCount); + sarg, orcTail.getWriterVersion(), + footer.getTypesList(), orcTail.getStripeStatistics(), stripeCount); // For ORC case, send the boundaries of the stripes so we don't have to send the footer. SplitInfos.Builder sb = SplitInfos.newBuilder(); - List stripes = fi.getStripes(); + List stripes = orcTail.getStripes(); boolean isEliminated = true; for (int i = 0; i < result.length; ++i) { if (result != null && !result[i]) continue; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index d7a8c2f..c659487 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -53,11 +53,11 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; import org.apache.orc.ColumnStatistics; -import org.apache.orc.FileMetaInfo; import org.apache.orc.OrcUtils; import org.apache.orc.StripeInformation; import org.apache.orc.StripeStatistics; import org.apache.orc.TypeDescription; +import org.apache.orc.impl.OrcTail; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -648,20 +648,20 @@ public AcidDirInfo(FileSystem fs, Path splitPath, Directory acidInfo, private final Context context; private final FileSystem fs; private final HdfsFileStatusWithId fileWithId; - private final FileInfo fileInfo; + private final OrcTail orcTail; private final boolean isOriginal; private final List deltas; private final boolean hasBase; private final ByteBuffer ppdResult; - SplitInfo(Context context, FileSystem fs, HdfsFileStatusWithId fileWithId, FileInfo fileInfo, + SplitInfo(Context context, FileSystem fs, HdfsFileStatusWithId fileWithId, OrcTail orcTail, boolean isOriginal, List deltas, boolean hasBase, Path dir, boolean[] covered, ByteBuffer ppdResult) throws IOException { super(dir, context.numBuckets, deltas, covered); this.context = context; this.fs = fs; this.fileWithId = fileWithId; - this.fileInfo = fileInfo; + this.orcTail = orcTail; this.isOriginal = isOriginal; this.deltas = deltas; this.hasBase = hasBase; @@ -669,11 +669,11 @@ public AcidDirInfo(FileSystem fs, Path splitPath, Directory acidInfo, } @VisibleForTesting - public SplitInfo(Context context, FileSystem fs, FileStatus fileStatus, FileInfo fileInfo, + public SplitInfo(Context context, FileSystem fs, FileStatus fileStatus, OrcTail orcTail, boolean isOriginal, ArrayList deltas, boolean hasBase, Path dir, boolean[] covered) throws IOException { this(context, fs, AcidUtils.createOriginalObj(null, fileStatus), - fileInfo, isOriginal, deltas, hasBase, dir, covered, null); + orcTail, isOriginal, deltas, hasBase, dir, covered, null); } } @@ -728,13 +728,13 @@ public ETLSplitStrategy(Context context, FileSystem fs, Path dir, FooterCache cache = context.cacheStripeDetails ? ((deltas == null || deltas.isEmpty()) ? 
context.footerCache : Context.localCache) : null; if (cache != null) { - FileInfo[] infos = new FileInfo[files.size()]; + OrcTail[] orcTails = new OrcTail[files.size()]; ByteBuffer[] ppdResults = null; if (cache.hasPpd()) { ppdResults = new ByteBuffer[files.size()]; } try { - cache.getAndValidate(files, isOriginal, infos, ppdResults); + cache.getAndValidate(files, isOriginal, orcTails, ppdResults); } catch (HiveException e) { throw new IOException(e); } @@ -745,16 +745,16 @@ public ETLSplitStrategy(Context context, FileSystem fs, Path dir, dir = dirs.get(++dirIx); filesInDirCount = dir.fileCount; } - FileInfo info = infos[i]; + OrcTail orcTail = orcTails[i]; ByteBuffer ppdResult = ppdResults == null ? null : ppdResults[i]; HdfsFileStatusWithId file = files.get(i); - if (info != null) { + if (orcTail != null) { // Cached copy is valid context.cacheHitCounter.incrementAndGet(); } // Ignore files eliminated by PPD, or of 0 length. if (ppdResult != FooterCache.NO_SPLIT_AFTER_PPD && file.getFileStatus().getLen() > 0) { - result.add(new SplitInfo(context, dir.fs, file, info, + result.add(new SplitInfo(context, dir.fs, file, orcTail, isOriginal, deltas, true, dir.dir, covered, ppdResult)); } } @@ -921,7 +921,7 @@ public BISplitStrategy(Context context, FileSystem fs, for (Map.Entry entry : blockOffsets.entrySet()) { OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(), entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true, - deltas, -1); + deltas, -1, fileStatus.getLen()); splits.add(orcSplit); } } @@ -963,7 +963,7 @@ public ACIDSplitStrategy(Path dir, int numBuckets, List deltas, b if (!deltas.isEmpty()) { for (int b = 0; b < numBuckets; ++b) { if (!covered[b]) { - splits.add(new OrcSplit(dir, null, b, 0, new String[0], null, false, false, deltas, -1)); + splits.add(new OrcSplit(dir, null, b, 0, new String[0], null, false, false, deltas, -1, -1)); } } } @@ -1054,9 +1054,8 @@ private AcidDirInfo callInternal() throws IOException { private final Long fsFileId; private final long blockSize; private final TreeMap locations; - private final FileInfo fileInfo; + private OrcTail orcTail; private List stripes; - private FileMetaInfo fileMetaInfo; private List stripeStats; private List types; private boolean[] includedCols; @@ -1078,7 +1077,7 @@ public SplitGenerator(SplitInfo splitInfo, UserGroupInformation ugi, this.file = splitInfo.fileWithId.getFileStatus(); this.fsFileId = splitInfo.fileWithId.getFileId(); this.blockSize = this.file.getBlockSize(); - this.fileInfo = splitInfo.fileInfo; + this.orcTail = splitInfo.orcTail; // TODO: potential DFS call this.locations = SHIMS.getLocationsWithOffset(fs, file); this.isOriginal = splitInfo.isOriginal; @@ -1129,11 +1128,10 @@ static long getOverlap(long offset1, long length1, * are written with large block sizes. 
* @param offset the start of the split * @param length the length of the split - * @param fileMetaInfo file metadata from footer and postscript + * @param orcTail orc tail * @throws IOException */ - OrcSplit createSplit(long offset, long length, - FileMetaInfo fileMetaInfo) throws IOException { + OrcSplit createSplit(long offset, long length, OrcTail orcTail) throws IOException { String[] hosts; Map.Entry startEntry = locations.floorEntry(offset); BlockLocation start = startEntry.getValue(); @@ -1196,7 +1194,7 @@ OrcSplit createSplit(long offset, long length, fileKey = new SyntheticFileId(file); } return new OrcSplit(file.getPath(), fileKey, offset, length, hosts, - fileMetaInfo, isOriginal, hasBase, deltas, scaledProjSize); + orcTail, isOriginal, hasBase, deltas, scaledProjSize, fileLen); } private static final class OffsetAndLength { // Java cruft; pair of long. @@ -1271,7 +1269,7 @@ public String toString() { int index = si.getIndex(); if (lastIdx >= 0 && lastIdx + 1 != index && current.offset != -1) { // Create split for the previous unfinished stripe. - splits.add(createSplit(current.offset, current.length, null)); + splits.add(createSplit(current.offset, current.length, orcTail)); current.offset = -1; } lastIdx = index; @@ -1305,16 +1303,16 @@ public String toString() { if (!includeStripe[idx]) { // create split for the previous unfinished stripe if (current.offset != -1) { - splits.add(createSplit(current.offset, current.length, fileMetaInfo)); + splits.add(createSplit(current.offset, current.length, orcTail)); current.offset = -1; } continue; } current = generateOrUpdateSplit( - splits, current, stripe.getOffset(), stripe.getLength(), fileMetaInfo); + splits, current, stripe.getOffset(), stripe.getLength(), orcTail); } - generateLastSplit(splits, current, fileMetaInfo); + generateLastSplit(splits, current, orcTail); // Add uncovered ACID delta splits. splits.addAll(deltaSplits); @@ -1323,12 +1321,12 @@ public String toString() { private OffsetAndLength generateOrUpdateSplit( List splits, OffsetAndLength current, long offset, - long length, FileMetaInfo fileMetaInfo) throws IOException { + long length, OrcTail orcTail) throws IOException { // if we are working on a stripe, over the min stripe size, and // crossed a block boundary, cut the input split here. if (current.offset != -1 && current.length > context.minSize && (current.offset / blockSize != offset / blockSize)) { - splits.add(createSplit(current.offset, current.length, fileMetaInfo)); + splits.add(createSplit(current.offset, current.length, orcTail)); current.offset = -1; } // if we aren't building a split, start a new one. @@ -1339,59 +1337,41 @@ private OffsetAndLength generateOrUpdateSplit( current.length = (offset + length) - current.offset; } if (current.length >= context.maxSize) { - splits.add(createSplit(current.offset, current.length, fileMetaInfo)); + splits.add(createSplit(current.offset, current.length, orcTail)); current.offset = -1; } return current; } private void generateLastSplit(List splits, OffsetAndLength current, - FileMetaInfo fileMetaInfo) throws IOException { + OrcTail orcTail) throws IOException { if (current.offset == -1) return; - splits.add(createSplit(current.offset, current.length, fileMetaInfo)); + splits.add(createSplit(current.offset, current.length, orcTail)); } private void populateAndCacheStripeDetails() throws IOException { - // Only create OrcReader if we are missing some information. 
- List colStatsLocal; - List typesLocal; - if (fileInfo != null) { - stripes = fileInfo.stripeInfos; - stripeStats = fileInfo.stripeStats; - fileMetaInfo = fileInfo.fileMetaInfo; - typesLocal = types = fileInfo.types; - colStatsLocal = fileInfo.fileStats; - writerVersion = fileInfo.writerVersion; - // For multiple runs, in case sendSplitsInFooter changes - if (fileMetaInfo == null && context.footerInSplits) { - Reader orcReader = createOrcReader(); - fileInfo.fileMetaInfo = ((ReaderImpl) orcReader).getFileMetaInfo(); - assert fileInfo.stripeStats != null && fileInfo.types != null - && fileInfo.writerVersion != null; - // We assume that if we needed to create a reader, we need to cache it to meta cache. - // This will also needlessly overwrite it in local cache for now. - context.footerCache.put(fsFileId, file, fileInfo.fileMetaInfo, orcReader); - } - } else { - Reader orcReader = createOrcReader(); - stripes = orcReader.getStripes(); - typesLocal = types = orcReader.getTypes(); - colStatsLocal = orcReader.getOrcProtoFileStatistics(); - writerVersion = orcReader.getWriterVersion(); - stripeStats = orcReader.getStripeStatistics(); - fileMetaInfo = context.footerInSplits ? - ((ReaderImpl) orcReader).getFileMetaInfo() : null; + if (orcTail == null) { + Reader orcReader = OrcFile.createReader(file.getPath(), + OrcFile.readerOptions(context.conf) + .filesystem(fs) + .maxLength(file.getLen())); + orcTail = new OrcTail(orcReader.getFileTail(), orcReader.getSerializedFileFooter(), + file.getModificationTime()); if (context.cacheStripeDetails) { - context.footerCache.put(fsFileId, file, fileMetaInfo, orcReader); + context.footerCache.put(new FooterCacheKey(fsFileId, file.getPath()), orcTail); } } + stripes = orcTail.getStripes(); + stripeStats = orcTail.getStripeStatistics(); + types = orcTail.getTypes(); + writerVersion = orcTail.getWriterVersion(); includedCols = genIncludedColumns(types, context.conf, isOriginal); - projColsUncompressedSize = computeProjectionSize(typesLocal, colStatsLocal, includedCols, isOriginal); - } - - private Reader createOrcReader() throws IOException { - return OrcFile.createReader(file.getPath(), - OrcFile.readerOptions(context.conf).filesystem(fs).maxLength(file.getLen())); + List fileColStats = orcTail.getFooter().getStatisticsList(); + projColsUncompressedSize = computeProjectionSize(types, fileColStats, includedCols, + isOriginal); + if (!context.footerInSplits) { + orcTail = null; + } } private long computeProjectionSize(List types, @@ -1595,40 +1575,6 @@ private static void scheduleSplits(ETLSplitStrategy splitStrategy, Context conte return result.toArray(new InputSplit[result.size()]); } - /** - * FileInfo. - * - * Stores information relevant to split generation for an ORC File. 
- * - */ - static class FileInfo { - final long modificationTime; - final long size; - final Long fileId; - private final List stripeInfos; - private FileMetaInfo fileMetaInfo; - private final List stripeStats; - private final List fileStats; - private final List types; - private final OrcFile.WriterVersion writerVersion; - - - FileInfo(long modificationTime, long size, List stripeInfos, - List stripeStats, List types, - List fileStats, FileMetaInfo fileMetaInfo, - OrcFile.WriterVersion writerVersion, Long fileId) { - this.modificationTime = modificationTime; - this.size = size; - this.fileId = fileId; - this.stripeInfos = stripeInfos; - this.fileMetaInfo = fileMetaInfo; - this.stripeStats = stripeStats; - this.types = types; - this.fileStats = fileStats; - this.writerVersion = writerVersion; - } - } - @SuppressWarnings("unchecked") private org.apache.hadoop.mapred.RecordReader createVectorizedReader(InputSplit split, JobConf conf, Reporter reporter @@ -1643,18 +1589,22 @@ private static void scheduleSplits(ETLSplitStrategy splitStrategy, Context conte Reporter reporter) throws IOException { boolean vectorMode = Utilities.getUseVectorizedInputFileFormat(conf); boolean isAcidRead = isAcidRead(conf, inputSplit); - if (!isAcidRead) { if (vectorMode) { return createVectorizedReader(inputSplit, conf, reporter); } else { + OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(conf); + if (inputSplit instanceof OrcSplit) { + OrcSplit split = (OrcSplit) inputSplit; + readerOptions.maxLength(split.getFileLength()).orcTail(split.getOrcTail()); + } return new OrcRecordReader(OrcFile.createReader( ((FileSplit) inputSplit).getPath(), - OrcFile.readerOptions(conf)), conf, (FileSplit) inputSplit); + readerOptions), + conf, (FileSplit) inputSplit); } } - OrcSplit split = (OrcSplit) inputSplit; reporter.setStatus(inputSplit.toString()); Options options = new Options(conf).reporter(reporter); @@ -1759,7 +1709,12 @@ public float getProgress() throws IOException { if (split.hasBase()) { bucket = AcidUtils.parseBaseBucketFilename(split.getPath(), conf) .getBucket(); - reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); + OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(conf) + .maxLength(split.getFileLength()); + if (split.hasFooter()) { + readerOptions.orcTail(split.getOrcTail()); + } + reader = OrcFile.createReader(path, readerOptions); } else { bucket = (int) split.getStart(); reader = null; @@ -1982,16 +1937,32 @@ private static boolean isStripeSatisfyPredicate( * Represents footer cache. 
*/ public interface FooterCache { - static final ByteBuffer NO_SPLIT_AFTER_PPD = ByteBuffer.wrap(new byte[0]); + ByteBuffer NO_SPLIT_AFTER_PPD = ByteBuffer.wrap(new byte[0]); void getAndValidate(List files, boolean isOriginal, - FileInfo[] result, ByteBuffer[] ppdResult) throws IOException, HiveException; + OrcTail[] result, ByteBuffer[] ppdResult) throws IOException, HiveException; boolean hasPpd(); boolean isBlocking(); - void put(Long fileId, FileStatus file, FileMetaInfo fileMetaInfo, Reader orcReader) - throws IOException; + void put(FooterCacheKey cacheKey, OrcTail orcTail) throws IOException; } + public static class FooterCacheKey { + Long fileId; // used by external cache + Path path; // used by local cache + + FooterCacheKey(Long fileId, Path path) { + this.fileId = fileId; + this.path = path; + } + + public Long getFileId() { + return fileId; + } + + public Path getPath() { + return path; + } + } /** * Convert a Hive type property string that contains separated type names into a list of * TypeDescription objects. diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java index 2e63aba..0c85827 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java @@ -20,27 +20,25 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.hive.ql.io.AcidInputFormat; -import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.mapreduce.lib.input.FileSplit; -import org.apache.orc.FileMetaInfo; +import org.apache.orc.OrcProto; +import org.apache.orc.impl.OrcTail; /** * OrcFileSplit. Holds file meta info * */ public class OrcNewSplit extends FileSplit { - private FileMetaInfo fileMetaInfo; + private OrcTail orcTail; private boolean hasFooter; private boolean isOriginal; private boolean hasBase; private final List deltas = new ArrayList<>(); - private OrcFile.WriterVersion writerVersion; protected OrcNewSplit(){ //The FileSplit() constructor in hadoop 0.20 and 1.x is package private so can't use it. 
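The write()/readFields() changes below replace the old field-by-field FileMetaInfo encoding with a single length-prefixed protobuf blob: a vint carrying the serialized FileTail size, followed by the raw bytes. A self-contained sketch of that framing, assuming only Hadoop's WritableUtils and the regenerated OrcProto classes (class and method names here are illustrative):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInput;
    import java.io.DataInputStream;
    import java.io.DataOutput;
    import java.io.DataOutputStream;
    import java.io.IOException;

    import org.apache.hadoop.io.WritableUtils;
    import org.apache.orc.OrcProto;

    public class TailFramingSketch {
      // Frame the tail as vint(length) followed by the protobuf bytes.
      static void writeTail(DataOutput out, OrcProto.FileTail tail) throws IOException {
        byte[] buf = tail.toByteArray();
        WritableUtils.writeVInt(out, buf.length);
        out.write(buf);
      }

      // Undo the framing: read the vint, then exactly that many bytes, then parse.
      static OrcProto.FileTail readTail(DataInput in) throws IOException {
        int len = WritableUtils.readVInt(in);
        byte[] buf = new byte[len];
        in.readFully(buf);
        return OrcProto.FileTail.parseFrom(buf);
      }

      public static void main(String[] args) throws IOException {
        OrcProto.FileTail tail = OrcProto.FileTail.newBuilder().setFileLength(1024L).build();
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        writeTail(new DataOutputStream(bos), tail);
        OrcProto.FileTail back =
            readTail(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(back.getFileLength());   // 1024
      }
    }

The splits serialize orcTail.getMinimalFileTail() rather than the full tail, which, as the name suggests, trims the per-split payload; the tests further down show that shipping the footer with the split (hive.orc.splits.include.file.footer=true) saves the extra open-to-read-footer call per split.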
@@ -52,7 +50,7 @@ protected OrcNewSplit(){ public OrcNewSplit(OrcSplit inner) throws IOException { super(inner.getPath(), inner.getStart(), inner.getLength(), inner.getLocations()); - this.fileMetaInfo = inner.getFileMetaInfo(); + this.orcTail = inner.getOrcTail(); this.hasFooter = inner.hasFooter(); this.isOriginal = inner.isOriginal(); this.hasBase = inner.hasBase(); @@ -73,20 +71,11 @@ public void write(DataOutput out) throws IOException { delta.write(out); } if (hasFooter) { - // serialize FileMetaInfo fields - Text.writeString(out, fileMetaInfo.compressionType); - WritableUtils.writeVInt(out, fileMetaInfo.bufferSize); - WritableUtils.writeVInt(out, fileMetaInfo.metadataSize); - - // serialize FileMetaInfo field footer - ByteBuffer footerBuff = fileMetaInfo.footerBuffer; - footerBuff.reset(); - - // write length of buffer - WritableUtils.writeVInt(out, footerBuff.limit() - footerBuff.position()); - out.write(footerBuff.array(), footerBuff.position(), - footerBuff.limit() - footerBuff.position()); - WritableUtils.writeVInt(out, fileMetaInfo.writerVersion.getId()); + OrcProto.FileTail fileTail = orcTail.getMinimalFileTail(); + byte[] tailBuffer = fileTail.toByteArray(); + int tailLen = tailBuffer.length; + WritableUtils.writeVInt(out, tailLen); + out.write(tailBuffer); } } @@ -108,25 +97,16 @@ public void readFields(DataInput in) throws IOException { deltas.add(dmd); } if (hasFooter) { - // deserialize FileMetaInfo fields - String compressionType = Text.readString(in); - int bufferSize = WritableUtils.readVInt(in); - int metadataSize = WritableUtils.readVInt(in); - - // deserialize FileMetaInfo field footer - int footerBuffSize = WritableUtils.readVInt(in); - ByteBuffer footerBuff = ByteBuffer.allocate(footerBuffSize); - in.readFully(footerBuff.array(), 0, footerBuffSize); - OrcFile.WriterVersion writerVersion = - ReaderImpl.getWriterVersion(WritableUtils.readVInt(in)); - - fileMetaInfo = new FileMetaInfo(compressionType, bufferSize, - metadataSize, footerBuff, writerVersion); + int tailLen = WritableUtils.readVInt(in); + byte[] tailBuffer = new byte[tailLen]; + in.readFully(tailBuffer); + OrcProto.FileTail fileTail = OrcProto.FileTail.parseFrom(tailBuffer); + orcTail = new OrcTail(fileTail, null); } } - FileMetaInfo getFileMetaInfo(){ - return fileMetaInfo; + public OrcTail getOrcTail() { + return orcTail; } public boolean hasFooter() { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java index 407fd62..969e70e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java @@ -18,24 +18,26 @@ package org.apache.hadoop.hive.ql.io.orc; +import java.io.ByteArrayOutputStream; import java.io.DataInput; import java.io.DataOutput; +import java.io.DataOutputStream; import java.io.IOException; -import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; -import org.apache.orc.FileMetaInfo; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.ColumnarSplit; import org.apache.hadoop.hive.ql.io.AcidInputFormat; +import org.apache.hadoop.hive.ql.io.ColumnarSplit; import org.apache.hadoop.hive.ql.io.LlapAwareSplit; import org.apache.hadoop.hive.ql.io.SyntheticFileId; -import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.mapred.FileSplit; - +import org.apache.orc.OrcProto; +import org.apache.orc.impl.OrcTail; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** @@ -43,13 +45,15 @@ * */ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit { - private FileMetaInfo fileMetaInfo; + private static final Logger LOG = LoggerFactory.getLogger(OrcSplit.class); + private OrcTail orcTail; private boolean hasFooter; private boolean isOriginal; private boolean hasBase; private final List deltas = new ArrayList<>(); private long projColsUncompressedSize; private transient Object fileKey; + private long fileLen; static final int HAS_SYNTHETIC_FILEID_FLAG = 16; static final int HAS_LONG_FILEID_FLAG = 8; @@ -65,25 +69,40 @@ protected OrcSplit() { } public OrcSplit(Path path, Object fileId, long offset, long length, String[] hosts, - FileMetaInfo fileMetaInfo, boolean isOriginal, boolean hasBase, - List deltas, long projectedDataSize) { + OrcTail orcTail, boolean isOriginal, boolean hasBase, + List deltas, long projectedDataSize, long fileLen) { super(path, offset, length, hosts); // For HDFS, we could avoid serializing file ID and just replace the path with inode-based // path. However, that breaks bunch of stuff because Hive later looks up things by split path. this.fileKey = fileId; - this.fileMetaInfo = fileMetaInfo; - hasFooter = this.fileMetaInfo != null; + this.orcTail = orcTail; + hasFooter = this.orcTail != null; this.isOriginal = isOriginal; this.hasBase = hasBase; this.deltas.addAll(deltas); this.projColsUncompressedSize = projectedDataSize <= 0 ? length : projectedDataSize; + // setting file length to Long.MAX_VALUE will let orc reader read file length from file system + this.fileLen = fileLen <= 0 ? Long.MAX_VALUE : fileLen; } @Override public void write(DataOutput out) throws IOException { - //serialize path, offset, length using FileSplit - super.write(out); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(bos); + // serialize path, offset, length using FileSplit + super.write(dos); + int required = bos.size(); + + // write addition payload required for orc + writeAdditionalPayload(dos); + int additional = bos.size() - required; + + out.write(bos.toByteArray()); + LOG.info("Writing additional {} bytes to OrcSplit as payload. Required {} bytes.", additional, + required); + } + private void writeAdditionalPayload(final DataOutputStream out) throws IOException { boolean isFileIdLong = fileKey instanceof Long, isFileIdWritable = fileKey instanceof Writable; int flags = (hasBase ? BASE_FLAG : 0) | (isOriginal ? 
ORIGINAL_FLAG : 0) | @@ -96,26 +115,18 @@ public void write(DataOutput out) throws IOException { delta.write(out); } if (hasFooter) { - // serialize FileMetaInfo fields - Text.writeString(out, fileMetaInfo.compressionType); - WritableUtils.writeVInt(out, fileMetaInfo.bufferSize); - WritableUtils.writeVInt(out, fileMetaInfo.metadataSize); - - // serialize FileMetaInfo field footer - ByteBuffer footerBuff = fileMetaInfo.footerBuffer; - footerBuff.reset(); - - // write length of buffer - WritableUtils.writeVInt(out, footerBuff.limit() - footerBuff.position()); - out.write(footerBuff.array(), footerBuff.position(), - footerBuff.limit() - footerBuff.position()); - WritableUtils.writeVInt(out, fileMetaInfo.writerVersion.getId()); + OrcProto.FileTail fileTail = orcTail.getMinimalFileTail(); + byte[] tailBuffer = fileTail.toByteArray(); + int tailLen = tailBuffer.length; + WritableUtils.writeVInt(out, tailLen); + out.write(tailBuffer); } if (isFileIdLong) { out.writeLong(((Long)fileKey).longValue()); } else if (isFileIdWritable) { ((Writable)fileKey).write(out); } + out.writeLong(fileLen); } @Override @@ -141,20 +152,11 @@ public void readFields(DataInput in) throws IOException { deltas.add(dmd); } if (hasFooter) { - // deserialize FileMetaInfo fields - String compressionType = Text.readString(in); - int bufferSize = WritableUtils.readVInt(in); - int metadataSize = WritableUtils.readVInt(in); - - // deserialize FileMetaInfo field footer - int footerBuffSize = WritableUtils.readVInt(in); - ByteBuffer footerBuff = ByteBuffer.allocate(footerBuffSize); - in.readFully(footerBuff.array(), 0, footerBuffSize); - OrcFile.WriterVersion writerVersion = - ReaderImpl.getWriterVersion(WritableUtils.readVInt(in)); - - fileMetaInfo = new FileMetaInfo(compressionType, bufferSize, - metadataSize, footerBuff, writerVersion); + int tailLen = WritableUtils.readVInt(in); + byte[] tailBuffer = new byte[tailLen]; + in.readFully(tailBuffer); + OrcProto.FileTail fileTail = OrcProto.FileTail.parseFrom(tailBuffer); + orcTail = new OrcTail(fileTail, null); } if (hasLongFileId) { fileKey = in.readLong(); @@ -163,10 +165,11 @@ public void readFields(DataInput in) throws IOException { fileId.readFields(in); this.fileKey = fileId; } + fileLen = in.readLong(); } - FileMetaInfo getFileMetaInfo(){ - return fileMetaInfo; + public OrcTail getOrcTail() { + return orcTail; } public boolean hasFooter() { @@ -185,6 +188,10 @@ public boolean hasBase() { return deltas; } + public long getFileLength() { + return fileLen; + } + /** * If this method returns true, then for sure it is ACID. * However, if it returns false.. it could be ACID or non-ACID. @@ -215,7 +222,7 @@ public boolean canUseLlapIo() { @Override public String toString() { return "OrcSplit [" + getPath() + ", start=" + getStart() + ", length=" + getLength() - + ", isOriginal=" + isOriginal + ", hasBase=" + hasBase + ", deltas=" - + (deltas == null ? 0 : deltas.size()) + "]"; + + ", isOriginal=" + isOriginal + ", fileLength=" + fileLen + ", hasFooter=" + hasFooter + + ", hasBase=" + hasBase + ", deltas=" + (deltas == null ? 
0 : deltas.size()) + "]"; } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java index 0b40fef..1bc781e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java @@ -20,47 +20,22 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; -import org.apache.orc.impl.BufferChunk; -import org.apache.orc.CompressionCodec; -import org.apache.orc.FileMetaInfo; -import org.apache.orc.FileMetadata; -import org.apache.orc.impl.InStream; -import org.apache.orc.StripeInformation; -import org.apache.orc.StripeStatistics; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.io.DiskRange; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.orc.OrcProto; - -import com.google.common.collect.Lists; -import com.google.protobuf.CodedInputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class ReaderImpl extends org.apache.orc.impl.ReaderImpl implements Reader { private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class); - private static final int DIRECTORY_SIZE_GUESS = 16 * 1024; - private final ObjectInspector inspector; - //serialized footer - Keeping this around for use by getFileMetaInfo() - // will help avoid cpu cycles spend in deserializing at cost of increased - // memory footprint. - private ByteBuffer footerByteBuffer; - // Same for metastore cache - maintains the same background buffer, but includes postscript. - // This will only be set if the file footer/metadata was read from disk. - private ByteBuffer footerMetaAndPsBuffer; - @Override public ObjectInspector getObjectInspector() { return inspector; @@ -83,268 +58,19 @@ public ObjectInspector getObjectInspector() { * @param options options for reading * @throws IOException */ - public ReaderImpl(Path path, - OrcFile.ReaderOptions options) throws IOException { + public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { super(path, options); - FileMetadata fileMetadata = options.getFileMetadata(); - if (fileMetadata != null) { - this.inspector = OrcStruct.createObjectInspector(0, fileMetadata.getTypes()); - } else { - FileMetaInfo footerMetaData; - if (options.getFileMetaInfo() != null) { - footerMetaData = options.getFileMetaInfo(); - } else { - footerMetaData = extractMetaInfoFromFooter(fileSystem, path, - options.getMaxLength()); - } - this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer; - MetaInfoObjExtractor rInfo = - new MetaInfoObjExtractor(footerMetaData.compressionType, - footerMetaData.bufferSize, - footerMetaData.metadataSize, - footerMetaData.footerBuffer - ); - this.footerByteBuffer = footerMetaData.footerBuffer; - this.inspector = rInfo.inspector; - } - } - - /** Extracts the necessary metadata from an externally store buffer (fullFooterBuffer). */ - public static FooterInfo extractMetaInfoFromFooter( - ByteBuffer bb, Path srcPath) throws IOException { - // Read the PostScript. Be very careful as some parts of this historically use bb position - // and some use absolute offsets that have to take position into account. 
- int baseOffset = bb.position(); - int lastByteAbsPos = baseOffset + bb.remaining() - 1; - int psLen = bb.get(lastByteAbsPos) & 0xff; - int psAbsPos = lastByteAbsPos - psLen; - OrcProto.PostScript ps = extractPostScript(bb, srcPath, psLen, psAbsPos); - assert baseOffset == bb.position(); - - // Extract PS information. - int footerSize = (int)ps.getFooterLength(), metadataSize = (int)ps.getMetadataLength(), - footerAbsPos = psAbsPos - footerSize, metadataAbsPos = footerAbsPos - metadataSize; - String compressionType = ps.getCompression().toString(); - CompressionCodec codec = - WriterImpl.createCodec(org.apache.orc.CompressionKind.valueOf - (compressionType)); - int bufferSize = (int)ps.getCompressionBlockSize(); - bb.position(metadataAbsPos); - bb.mark(); - - // Extract metadata and footer. - OrcProto.Metadata metadata = extractMetadata( - bb, metadataAbsPos, metadataSize, codec, bufferSize); - List stats = new ArrayList<>(metadata.getStripeStatsCount()); - for (OrcProto.StripeStatistics ss : metadata.getStripeStatsList()) { - stats.add(new StripeStatistics(ss.getColStatsList())); - } - OrcProto.Footer footer = extractFooter(bb, footerAbsPos, footerSize, codec, bufferSize); - bb.position(metadataAbsPos); - bb.limit(psAbsPos); - // TODO: do we need footer buffer here? FileInfo/FileMetaInfo is a mess... - FileMetaInfo fmi = new FileMetaInfo( - compressionType, bufferSize, metadataSize, bb, extractWriterVersion(ps)); - return new FooterInfo(stats, footer, fmi); - } - - private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos, - int footerSize, CompressionCodec codec, int bufferSize) throws IOException { - bb.position(footerAbsPos); - bb.limit(footerAbsPos + footerSize); - return OrcProto.Footer.parseFrom(InStream.createCodedInputStream("footer", - Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize)); - } - - private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos, - int metadataSize, CompressionCodec codec, int bufferSize) throws IOException { - bb.position(metadataAbsPos); - bb.limit(metadataAbsPos + metadataSize); - return OrcProto.Metadata.parseFrom(InStream.createCodedInputStream("metadata", - Lists.newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize)); - } - - private static OrcProto.PostScript extractPostScript(ByteBuffer bb, Path path, - int psLen, int psAbsOffset) throws IOException { - // TODO: when PB is upgraded to 2.6, newInstance(ByteBuffer) method should be used here. - assert bb.hasArray(); - CodedInputStream in = CodedInputStream.newInstance( - bb.array(), bb.arrayOffset() + psAbsOffset, psLen); - OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in); - checkOrcVersion(LOG, path, ps.getVersionList()); - - // Check compression codec. 
- switch (ps.getCompression()) { - case NONE: - break; - case ZLIB: - break; - case SNAPPY: - break; - case LZO: - break; - default: - throw new IllegalArgumentException("Unknown compression"); - } - return ps; - } - - private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, - Path path, - long maxFileLength - ) throws IOException { - FSDataInputStream file = fs.open(path); - ByteBuffer buffer = null, fullFooterBuffer = null; - OrcProto.PostScript ps = null; - OrcFile.WriterVersion writerVersion = null; - try { - // figure out the size of the file using the option or filesystem - long size; - if (maxFileLength == Long.MAX_VALUE) { - size = fs.getFileStatus(path).getLen(); - } else { - size = maxFileLength; - } - - //read last bytes into buffer to get PostScript - int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS); - buffer = ByteBuffer.allocate(readSize); - assert buffer.position() == 0; - file.readFully((size - readSize), - buffer.array(), buffer.arrayOffset(), readSize); - buffer.position(0); - - //read the PostScript - //get length of PostScript - int psLen = buffer.get(readSize - 1) & 0xff; - ensureOrcFooter(file, path, psLen, buffer); - int psOffset = readSize - 1 - psLen; - ps = extractPostScript(buffer, path, psLen, psOffset); - - int footerSize = (int) ps.getFooterLength(); - int metadataSize = (int) ps.getMetadataLength(); - writerVersion = extractWriterVersion(ps); - - //check if extra bytes need to be read - int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize); - if (extra > 0) { - //more bytes need to be read, seek back to the right place and read extra bytes - ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize); - file.readFully((size - readSize - extra), extraBuf.array(), - extraBuf.arrayOffset() + extraBuf.position(), extra); - extraBuf.position(extra); - //append with already read bytes - extraBuf.put(buffer); - buffer = extraBuf; - buffer.position(0); - fullFooterBuffer = buffer.slice(); - buffer.limit(footerSize + metadataSize); - } else { - //footer is already in the bytes in buffer, just adjust position, length - buffer.position(psOffset - footerSize - metadataSize); - fullFooterBuffer = buffer.slice(); - buffer.limit(psOffset); - } - - // remember position for later TODO: what later? this comment is useless - buffer.mark(); - } finally { - try { - file.close(); - } catch (IOException ex) { - LOG.error("Failed to close the file after another error", ex); + this.inspector = OrcStruct.createObjectInspector(0, types); + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("hdfs")) { + LOG.info("HDFS FileSystem counters after ORC reader creation: {}", statistics.toString()); } } - - return new FileMetaInfo( - ps.getCompression().toString(), - (int) ps.getCompressionBlockSize(), - (int) ps.getMetadataLength(), - buffer, - ps.getVersionList(), - writerVersion, - fullFooterBuffer - ); - } - - /** - * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl - * from serialized fields. - * As the fields are final, the fields need to be initialized in the constructor and - * can't be done in some helper function. So this helper class is used instead. 
- * - */ - private static class MetaInfoObjExtractor{ - final org.apache.orc.CompressionKind compressionKind; - final CompressionCodec codec; - final int bufferSize; - final int metadataSize; - final OrcProto.Metadata metadata; - final OrcProto.Footer footer; - final ObjectInspector inspector; - - MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize, - ByteBuffer footerBuffer) throws IOException { - - this.compressionKind = org.apache.orc.CompressionKind.valueOf(codecStr.toUpperCase()); - this.bufferSize = bufferSize; - this.codec = WriterImpl.createCodec(compressionKind); - this.metadataSize = metadataSize; - - int position = footerBuffer.position(); - int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize; - - this.metadata = extractMetadata(footerBuffer, position, metadataSize, codec, bufferSize); - this.footer = extractFooter( - footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize); - - footerBuffer.position(position); - this.inspector = OrcStruct.createObjectInspector(0, footer.getTypesList()); - } - } - - public FileMetaInfo getFileMetaInfo() { - return new FileMetaInfo(compressionKind.toString(), bufferSize, - getMetadataSize(), footerByteBuffer, getVersionList(), - getWriterVersion(), footerMetaAndPsBuffer); - } - - /** Same as FileMetaInfo, but with extra fields. FileMetaInfo is serialized for splits - * and so we don't just add fields to it, it's already messy and confusing. */ - public static final class FooterInfo { - private final OrcProto.Footer footer; - private final List metadata; - private final List stripes; - private final FileMetaInfo fileMetaInfo; - - private FooterInfo( - List metadata, OrcProto.Footer footer, FileMetaInfo fileMetaInfo) { - this.metadata = metadata; - this.footer = footer; - this.fileMetaInfo = fileMetaInfo; - this.stripes = convertProtoStripesToStripes(footer.getStripesList()); - } - - public OrcProto.Footer getFooter() { - return footer; - } - - public List getMetadata() { - return metadata; - } - - public FileMetaInfo getFileMetaInfo() { - return fileMetaInfo; - } - - public List getStripes() { - return stripes; - } } @Override public ByteBuffer getSerializedFileFooter() { - return footerMetaAndPsBuffer; + return tail.getSerializedTail(); } @Override diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java index e4d2e6e..32ac34e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java @@ -182,8 +182,9 @@ public VectorizedOrcInputFormat() { if(fSplit instanceof OrcSplit){ OrcSplit orcSplit = (OrcSplit) fSplit; if (orcSplit.hasFooter()) { - opts.fileMetaInfo(orcSplit.getFileMetaInfo()); + opts.orcTail(orcSplit.getOrcTail()); } + opts.maxLength(orcSplit.getFileLength()); } Reader reader = OrcFile.createReader(path, opts); return new VectorizedOrcRecordReader(reader, conf, fSplit); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java index edaecb3..c414801 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java @@ -649,7 +649,7 @@ public void testSplitGenReadOps() throws Exception { } // call-1: listLocatedStatus - mock:/mocktable // call-2: open - 
mock:/mocktable/0_0 - // call-3: open - mock:/mocktable/0_0 + // call-3: open - mock:/mocktable/0_1 assertEquals(3, readOpsDelta); assertEquals(2, splits.length); @@ -658,6 +658,704 @@ public void testSplitGenReadOps() throws Exception { } @Test + public void testSplitGenReadOpsLocalCache() throws Exception { + MockFileSystem fs = new MockFileSystem(conf); + MockPath mockPath = new MockPath(fs, "mock:///mocktbl"); + conf.set("hive.orc.cache.stripe.details.size", "-1"); + conf.set("mapred.input.dir", mockPath.toString()); + conf.set("fs.defaultFS", "mock:///"); + conf.set("fs.mock.impl", MockFileSystem.class.getName()); + StructObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = (StructObjectInspector) + ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = + OrcFile.createWriter(new Path(mockPath + "/0_0"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + int readOpsBefore = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1); + OrcInputFormat orcInputFormat = new OrcInputFormat(); + InputSplit[] splits = orcInputFormat.getSplits(conf, 2); + assertEquals(2, splits.length); + int readOpsDelta = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: listLocatedStatus - mock:/mocktable + // call-2: open - mock:/mocktable/0_0 + // call-3: open - mock:/mocktable/0_1 + assertEquals(3, readOpsDelta); + + // force BI to avoid reading footers + conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI"); + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + orcInputFormat = new OrcInputFormat(); + splits = orcInputFormat.getSplits(conf, 2); + assertEquals(2, splits.length); + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: listLocatedStatus - mock:/mocktable + assertEquals(1, readOpsDelta); + + // enable cache and use default strategy + conf.set("hive.orc.cache.stripe.details.size", "100"); + conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "HYBRID"); + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + orcInputFormat = new OrcInputFormat(); + splits = orcInputFormat.getSplits(conf, 2); + assertEquals(2, splits.length); + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + 
} + } + // call-1: listLocatedStatus - mock:/mocktable + // call-2: open - mock:/mocktable/0_0 + // call-3: open - mock:/mocktable/0_1 + assertEquals(3, readOpsDelta); + + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + orcInputFormat = new OrcInputFormat(); + splits = orcInputFormat.getSplits(conf, 2); + assertEquals(2, splits.length); + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: listLocatedStatus - mock:/mocktable + assertEquals(1, readOpsDelta); + + // revert back to local fs + conf.set("fs.defaultFS", "file:///"); + } + + @Test + public void testNonVectorReaderNoFooterSerialize() throws Exception { + MockFileSystem fs = new MockFileSystem(conf); + MockPath mockPath = new MockPath(fs, "mock:///mocktable1"); + conf.set("hive.orc.splits.include.file.footer", "false"); + conf.set("mapred.input.dir", mockPath.toString()); + conf.set("fs.defaultFS", "mock:///"); + conf.set("fs.mock.impl", MockFileSystem.class.getName()); + StructObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = (StructObjectInspector) + ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = + OrcFile.createWriter(new Path(mockPath + "/0_0"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + OrcInputFormat orcInputFormat = new OrcInputFormat(); + InputSplit[] splits = orcInputFormat.getSplits(conf, 2); + assertEquals(2, splits.length); + int readOpsBefore = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); + + for (InputSplit split : splits) { + assertTrue("OrcSplit is expected", split instanceof OrcSplit); + // ETL strategies will have start=3 (start of first stripe) + assertTrue(split.toString().contains("start=3")); + assertTrue(split.toString().contains("hasFooter=false")); + assertTrue(split.toString().contains("hasBase=true")); + assertTrue(split.toString().contains("deltas=0")); + if (split instanceof OrcSplit) { + assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in" + + " orc splits.", ((OrcSplit) split).hasFooter()); + } + orcInputFormat.getRecordReader(split, conf, null); + } + + int readOpsDelta = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: open to read footer - split 1 => mock:/mocktable1/0_0 + // call-2: open to read data - split 1 => mock:/mocktable1/0_0 + // call-3: open to read footer - split 2 => mock:/mocktable1/0_1 + // call-4: open to read data - split 2 => mock:/mocktable1/0_1 + assertEquals(4, readOpsDelta); + + // revert back to local fs + conf.set("fs.defaultFS", "file:///"); + } + + @Test + public void testNonVectorReaderFooterSerialize() throws Exception { + MockFileSystem fs = new MockFileSystem(conf); + MockPath mockPath = new MockPath(fs, "mock:///mocktable2"); + conf.set("hive.orc.splits.include.file.footer", "true"); + conf.set("mapred.input.dir", mockPath.toString()); + conf.set("fs.defaultFS", "mock:///"); + conf.set("fs.mock.impl", MockFileSystem.class.getName()); + StructObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = (StructObjectInspector) + ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = + OrcFile.createWriter(new Path(mockPath + "/0_0"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + OrcInputFormat orcInputFormat = new OrcInputFormat(); + InputSplit[] splits = orcInputFormat.getSplits(conf, 2); + assertEquals(2, splits.length); + int readOpsBefore = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); + + for (InputSplit split : splits) { + assertTrue("OrcSplit is expected", split instanceof OrcSplit); + // ETL strategies will have start=3 (start of first stripe) + assertTrue(split.toString().contains("start=3")); + assertTrue(split.toString().contains("hasFooter=true")); + assertTrue(split.toString().contains("hasBase=true")); + assertTrue(split.toString().contains("deltas=0")); + if (split instanceof OrcSplit) { + assertTrue("Footer serialize test for non-vector reader, hasFooter is expected in" + + " orc splits.", ((OrcSplit) split).hasFooter()); + } + orcInputFormat.getRecordReader(split, conf, null); + } + + int readOpsDelta = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: open to read data - split 1 => mock:/mocktable2/0_0 + // call-2: open to read data - split 2 => mock:/mocktable2/0_1 + assertEquals(2, readOpsDelta); + + // revert back to local fs + conf.set("fs.defaultFS", "file:///"); + } + + @Test + public void testVectorReaderNoFooterSerialize() throws Exception { + MockFileSystem fs = new MockFileSystem(conf); + MockPath mockPath = new MockPath(fs, "mock:///mocktable3"); + conf.set("hive.orc.splits.include.file.footer", "false"); + conf.set("mapred.input.dir", mockPath.toString()); + conf.set("fs.defaultFS", "mock:///"); + conf.set("fs.mock.impl", MockFileSystem.class.getName()); + StructObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = (StructObjectInspector) + ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + JobConf jobConf = createMockExecutionEnvironment(workDir, new Path("mock:///"), + "mocktable3", inspector, true, 0); + Writer writer = + OrcFile.createWriter(new Path(mockPath + "/0_0"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for (int i = 0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2 * i)); + } + writer.close(); + + writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for (int i = 0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2 * i)); + } + writer.close(); + + OrcInputFormat orcInputFormat = new OrcInputFormat(); + InputSplit[] splits = orcInputFormat.getSplits(conf, 2); + assertEquals(2, splits.length); + + int readOpsBefore = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); + + for (InputSplit split : splits) { + assertTrue("OrcSplit is expected", split instanceof OrcSplit); + // ETL strategies will have start=3 (start of first stripe) + assertTrue(split.toString().contains("start=3")); + assertTrue(split.toString().contains("hasFooter=false")); + assertTrue(split.toString().contains("hasBase=true")); + assertTrue(split.toString().contains("deltas=0")); + if (split instanceof OrcSplit) { + assertFalse("No footer serialize test for vector reader, hasFooter is not expected in" + + " orc splits.", ((OrcSplit) split).hasFooter()); + } + orcInputFormat.getRecordReader(split, jobConf, Reporter.NULL); + } + + int readOpsDelta = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: open to read footer - split 1 => mock:/mocktable3/0_0 + // call-2: open to read data - split 1 => mock:/mocktable3/0_0 + // call-3: open to read footer - split 2 => mock:/mocktable3/0_1 + // call-4: open to read data - split 2 => mock:/mocktable3/0_1 + assertEquals(4, readOpsDelta); + + // revert back to local fs + conf.set("fs.defaultFS", "file:///"); + } + + @Test + public void testVectorReaderFooterSerialize() throws Exception { + MockFileSystem fs = new MockFileSystem(conf); + MockPath mockPath = new MockPath(fs, "mock:///mocktable4"); + conf.set("hive.orc.splits.include.file.footer", "true"); + conf.set("mapred.input.dir", mockPath.toString()); + conf.set("fs.defaultFS", "mock:///"); + conf.set("fs.mock.impl", MockFileSystem.class.getName()); + StructObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = (StructObjectInspector) + ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + JobConf jobConf = createMockExecutionEnvironment(workDir, new Path("mock:///"), + "mocktable4", inspector, true, 0); + Writer writer = + OrcFile.createWriter(new Path(mockPath + "/0_0"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for (int i = 0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2 * i)); + } + writer.close(); + + writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for (int i = 0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2 * i)); + } + writer.close(); + + OrcInputFormat orcInputFormat = new OrcInputFormat(); + InputSplit[] splits = orcInputFormat.getSplits(conf, 2); + assertEquals(2, splits.length); + + int readOpsBefore = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); + + for (InputSplit split : splits) { + assertTrue("OrcSplit is expected", split instanceof OrcSplit); + // ETL strategies will have start=3 (start of first stripe) + assertTrue(split.toString().contains("start=3")); + assertTrue(split.toString().contains("hasFooter=true")); + assertTrue(split.toString().contains("hasBase=true")); + assertTrue(split.toString().contains("deltas=0")); + if (split instanceof OrcSplit) { + assertTrue("Footer serialize test for vector reader, hasFooter is expected in" + + " orc splits.", ((OrcSplit) split).hasFooter()); + } + orcInputFormat.getRecordReader(split, jobConf, Reporter.NULL); + } + + int readOpsDelta = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: open to read data - split 1 => mock:/mocktable4/0_0 + // call-2: open to read data - split 2 => mock:/mocktable4/0_1 + assertEquals(2, readOpsDelta); + + // revert back to local fs + conf.set("fs.defaultFS", "file:///"); + } + + @Test + public void testACIDReaderNoFooterSerialize() throws Exception { + MockFileSystem fs = new MockFileSystem(conf); + MockPath mockPath = new MockPath(fs, "mock:///mocktable5"); + conf.set("hive.transactional.table.scan", "true"); + conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty()); + conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty()); + conf.set("hive.orc.splits.include.file.footer", "false"); + conf.set("mapred.input.dir", mockPath.toString()); + conf.set("fs.defaultFS", "mock:///"); + conf.set("fs.mock.impl", MockFileSystem.class.getName()); + StructObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = (StructObjectInspector) + ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = + OrcFile.createWriter(new Path(mockPath + "/0_0"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + OrcInputFormat orcInputFormat = new OrcInputFormat(); + InputSplit[] splits = orcInputFormat.getSplits(conf, 2); + assertEquals(2, splits.length); + int readOpsBefore = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); + + for (InputSplit split : splits) { + assertTrue("OrcSplit is expected", split instanceof OrcSplit); + // ETL strategies will have start=3 (start of first stripe) + assertTrue(split.toString().contains("start=3")); + assertTrue(split.toString().contains("hasFooter=false")); + assertTrue(split.toString().contains("hasBase=true")); + assertTrue(split.toString().contains("deltas=0")); + if (split instanceof OrcSplit) { + assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in" + + " orc splits.", ((OrcSplit) split).hasFooter()); + } + orcInputFormat.getRecordReader(split, conf, Reporter.NULL); + } + + int readOpsDelta = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: open to read footer - split 1 => mock:/mocktable5/0_0 + // call-2: open to read data - split 1 => mock:/mocktable5/0_0 + // call-3: open to read footer - split 2 => mock:/mocktable5/0_1 + // call-4: open to read data - split 2 => mock:/mocktable5/0_1 + assertEquals(4, readOpsDelta); + + // revert back to local fs + conf.set("fs.defaultFS", "file:///"); + } + + @Test + public void testACIDReaderFooterSerialize() throws Exception { + MockFileSystem fs = new MockFileSystem(conf); + MockPath mockPath = new MockPath(fs, "mock:///mocktable6"); + conf.set("hive.transactional.table.scan", "true"); + conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty()); + conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty()); + conf.set("hive.orc.splits.include.file.footer", "true"); + conf.set("mapred.input.dir", mockPath.toString()); + conf.set("fs.defaultFS", "mock:///"); + conf.set("fs.mock.impl", MockFileSystem.class.getName()); + StructObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = (StructObjectInspector) + ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = + OrcFile.createWriter(new Path(mockPath + "/0_0"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + OrcInputFormat orcInputFormat = new OrcInputFormat(); + InputSplit[] splits = orcInputFormat.getSplits(conf, 2); + assertEquals(2, splits.length); + int readOpsBefore = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); + + for (InputSplit split : splits) { + assertTrue("OrcSplit is expected", split instanceof OrcSplit); + // ETL strategies will have start=3 (start of first stripe) + assertTrue(split.toString().contains("start=3")); + assertTrue(split.toString().contains("hasFooter=true")); + assertTrue(split.toString().contains("hasBase=true")); + assertTrue(split.toString().contains("deltas=0")); + if (split instanceof OrcSplit) { + assertTrue("Footer serialize test for ACID reader, hasFooter is expected in" + + " orc splits.", ((OrcSplit) split).hasFooter()); + } + orcInputFormat.getRecordReader(split, conf, Reporter.NULL); + } + + int readOpsDelta = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: open to read data - split 1 => mock:/mocktable6/0_0 + // call-2: open to read data - split 2 => mock:/mocktable6/0_1 + assertEquals(2, readOpsDelta); + + // revert back to local fs + conf.set("fs.defaultFS", "file:///"); + } + + @Test + public void testACIDReaderNoFooterSerializeWithDeltas() throws Exception { + MockFileSystem fs = new MockFileSystem(conf); + MockPath mockPath = new MockPath(fs, "mock:///mocktable7"); + conf.set("hive.transactional.table.scan", "true"); + conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty()); + conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty()); + conf.set("hive.orc.splits.include.file.footer", "false"); + conf.set("mapred.input.dir", mockPath.toString()); + conf.set("fs.defaultFS", "mock:///"); + conf.set("fs.mock.impl", MockFileSystem.class.getName()); + StructObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = (StructObjectInspector) + ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = + OrcFile.createWriter(new Path(mockPath + "/0_0"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + writer = OrcFile.createWriter(new Path(new Path(mockPath + "/delta_001_002") + "/0_1"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + OrcInputFormat orcInputFormat = new OrcInputFormat(); + InputSplit[] splits = orcInputFormat.getSplits(conf, 2); + assertEquals(1, splits.length); + int readOpsBefore = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); + + for (InputSplit split : splits) { + assertTrue("OrcSplit is expected", split instanceof OrcSplit); + // ETL strategies will have start=3 (start of first stripe) + assertTrue(split.toString().contains("start=3")); + assertTrue(split.toString().contains("hasFooter=false")); + assertTrue(split.toString().contains("hasBase=true")); + assertTrue(split.toString().contains("deltas=1")); + if (split instanceof OrcSplit) { + assertFalse("No footer serialize test for ACID reader, hasFooter is not expected in" + + " orc splits.", ((OrcSplit) split).hasFooter()); + } + orcInputFormat.getRecordReader(split, conf, Reporter.NULL); + } + + int readOpsDelta = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: open to read footer - split 1 => mock:/mocktable7/0_0 + // call-2: open to read data - split 1 => mock:/mocktable7/0_0 + // call-3: open side file (flush length) of delta directory + // call-4: fs.exists() check for delta_xxx_xxx/bucket_00000 file + assertEquals(4, readOpsDelta); + + // revert back to local fs + conf.set("fs.defaultFS", "file:///"); + } + + @Test + public void testACIDReaderFooterSerializeWithDeltas() throws Exception { + MockFileSystem fs = new MockFileSystem(conf); + MockPath mockPath = new MockPath(fs, "mock:///mocktable8"); + conf.set("hive.transactional.table.scan", "true"); + conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty()); + conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty()); + conf.set("hive.orc.splits.include.file.footer", "true"); + conf.set("mapred.input.dir", mockPath.toString()); + conf.set("fs.defaultFS", "mock:///"); + conf.set("fs.mock.impl", MockFileSystem.class.getName()); + StructObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = (StructObjectInspector) + ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = + OrcFile.createWriter(new Path(mockPath + "/0_0"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + writer = OrcFile.createWriter(new Path(new Path(mockPath + "/delta_001_002") + "/0_1"), + OrcFile.writerOptions(conf).blockPadding(false) + .bufferSize(1024).inspector(inspector)); + for(int i=0; i < 10; ++i) { + writer.addRow(new MyRow(i, 2*i)); + } + writer.close(); + + OrcInputFormat orcInputFormat = new OrcInputFormat(); + InputSplit[] splits = orcInputFormat.getSplits(conf, 2); + assertEquals(1, splits.length); + int readOpsBefore = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsBefore = statistics.getReadOps(); + } + } + assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); + + for (InputSplit split : splits) { + assertTrue("OrcSplit is expected", split instanceof OrcSplit); + // ETL strategies will have start=3 (start of first stripe) + assertTrue(split.toString().contains("start=3")); + assertTrue(split.toString().contains("hasFooter=true")); + assertTrue(split.toString().contains("hasBase=true")); + assertTrue(split.toString().contains("deltas=1")); + if (split instanceof OrcSplit) { + assertTrue("Footer serialize test for ACID reader, hasFooter is not expected in" + + " orc splits.", ((OrcSplit) split).hasFooter()); + } + orcInputFormat.getRecordReader(split, conf, Reporter.NULL); + } + + int readOpsDelta = -1; + for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { + if (statistics.getScheme().equalsIgnoreCase("mock")) { + readOpsDelta = statistics.getReadOps() - readOpsBefore; + } + } + // call-1: open to read data - split 1 => mock:/mocktable8/0_0 + // call-2: open side file (flush length) of delta directory + // call-3: fs.exists() check for delta_xxx_xxx/bucket_00000 file + assertEquals(3, readOpsDelta); + + // revert back to local fs + conf.set("fs.defaultFS", "file:///"); + } + + @Test public void testBIStrategySplitBlockBoundary() throws Exception { conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI"); OrcInputFormat.Context context = new OrcInputFormat.Context(conf);