+ implements org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTailOrBuilder {
+ public static final com.google.protobuf.Descriptors.Descriptor
+ getDescriptor() {
+ return org.apache.hadoop.hive.ql.io.orc.OrcProto.internal_static_orc_proto_FileTail_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internalGetFieldAccessorTable() {
+ return org.apache.hadoop.hive.ql.io.orc.OrcProto.internal_static_orc_proto_FileTail_fieldAccessorTable
+ .ensureFieldAccessorsInitialized(
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail.class, org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail.Builder.class);
+ }
+
+ // Construct using org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail.newBuilder()
+ private Builder() {
+ maybeForceBuilderInitialization();
+ }
+
+ private Builder(
+ com.google.protobuf.GeneratedMessage.BuilderParent parent) {
+ super(parent);
+ maybeForceBuilderInitialization();
+ }
+ private void maybeForceBuilderInitialization() {
+ if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
+ getPostscriptFieldBuilder();
+ getFooterFieldBuilder();
+ }
+ }
+ private static Builder create() {
+ return new Builder();
+ }
+
+ public Builder clear() {
+ super.clear();
+ if (postscriptBuilder_ == null) {
+ postscript_ = org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript.getDefaultInstance();
+ } else {
+ postscriptBuilder_.clear();
+ }
+ bitField0_ = (bitField0_ & ~0x00000001);
+ if (footerBuilder_ == null) {
+ footer_ = org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer.getDefaultInstance();
+ } else {
+ footerBuilder_.clear();
+ }
+ bitField0_ = (bitField0_ & ~0x00000002);
+ fileLength_ = 0L;
+ bitField0_ = (bitField0_ & ~0x00000004);
+ postscriptLength_ = 0L;
+ bitField0_ = (bitField0_ & ~0x00000008);
+ return this;
+ }
+
+ public Builder clone() {
+ return create().mergeFrom(buildPartial());
+ }
+
+ public com.google.protobuf.Descriptors.Descriptor
+ getDescriptorForType() {
+ return org.apache.hadoop.hive.ql.io.orc.OrcProto.internal_static_orc_proto_FileTail_descriptor;
+ }
+
+ public org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail getDefaultInstanceForType() {
+ return org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail.getDefaultInstance();
+ }
+
+ public org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail build() {
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail result = buildPartial();
+ if (!result.isInitialized()) {
+ throw newUninitializedMessageException(result);
+ }
+ return result;
+ }
+
+ public org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail buildPartial() {
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail result = new org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail(this);
+ int from_bitField0_ = bitField0_;
+ int to_bitField0_ = 0;
+ if (((from_bitField0_ & 0x00000001) == 0x00000001)) {
+ to_bitField0_ |= 0x00000001;
+ }
+ if (postscriptBuilder_ == null) {
+ result.postscript_ = postscript_;
+ } else {
+ result.postscript_ = postscriptBuilder_.build();
+ }
+ if (((from_bitField0_ & 0x00000002) == 0x00000002)) {
+ to_bitField0_ |= 0x00000002;
+ }
+ if (footerBuilder_ == null) {
+ result.footer_ = footer_;
+ } else {
+ result.footer_ = footerBuilder_.build();
+ }
+ if (((from_bitField0_ & 0x00000004) == 0x00000004)) {
+ to_bitField0_ |= 0x00000004;
+ }
+ result.fileLength_ = fileLength_;
+ if (((from_bitField0_ & 0x00000008) == 0x00000008)) {
+ to_bitField0_ |= 0x00000008;
+ }
+ result.postscriptLength_ = postscriptLength_;
+ result.bitField0_ = to_bitField0_;
+ onBuilt();
+ return result;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.Message other) {
+ if (other instanceof org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail) {
+ return mergeFrom((org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail)other);
+ } else {
+ super.mergeFrom(other);
+ return this;
+ }
+ }
+
+ public Builder mergeFrom(org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail other) {
+ if (other == org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail.getDefaultInstance()) return this;
+ if (other.hasPostscript()) {
+ mergePostscript(other.getPostscript());
+ }
+ if (other.hasFooter()) {
+ mergeFooter(other.getFooter());
+ }
+ if (other.hasFileLength()) {
+ setFileLength(other.getFileLength());
+ }
+ if (other.hasPostscriptLength()) {
+ setPostscriptLength(other.getPostscriptLength());
+ }
+ this.mergeUnknownFields(other.getUnknownFields());
+ return this;
+ }
+
+ public final boolean isInitialized() {
+ return true;
+ }
+
+ public Builder mergeFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail parsedMessage = null;
+ try {
+ parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry);
+ } catch (com.google.protobuf.InvalidProtocolBufferException e) {
+ parsedMessage = (org.apache.hadoop.hive.ql.io.orc.OrcProto.FileTail) e.getUnfinishedMessage();
+ throw e;
+ } finally {
+ if (parsedMessage != null) {
+ mergeFrom(parsedMessage);
+ }
+ }
+ return this;
+ }
+ private int bitField0_;
+
+ // optional .orc.proto.PostScript postscript = 1;
+ private org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript postscript_ = org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript.getDefaultInstance();
+ private com.google.protobuf.SingleFieldBuilder<
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript, org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript.Builder, org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScriptOrBuilder> postscriptBuilder_;
+ /**
+ * optional .orc.proto.PostScript postscript = 1;
+ */
+ public boolean hasPostscript() {
+ return ((bitField0_ & 0x00000001) == 0x00000001);
+ }
+ /**
+ * optional .orc.proto.PostScript postscript = 1;
+ */
+ public org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript getPostscript() {
+ if (postscriptBuilder_ == null) {
+ return postscript_;
+ } else {
+ return postscriptBuilder_.getMessage();
+ }
+ }
+ /**
+ * optional .orc.proto.PostScript postscript = 1;
+ */
+ public Builder setPostscript(org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript value) {
+ if (postscriptBuilder_ == null) {
+ if (value == null) {
+ throw new NullPointerException();
+ }
+ postscript_ = value;
+ onChanged();
+ } else {
+ postscriptBuilder_.setMessage(value);
+ }
+ bitField0_ |= 0x00000001;
+ return this;
+ }
+ /**
+ * optional .orc.proto.PostScript postscript = 1;
+ */
+ public Builder setPostscript(
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript.Builder builderForValue) {
+ if (postscriptBuilder_ == null) {
+ postscript_ = builderForValue.build();
+ onChanged();
+ } else {
+ postscriptBuilder_.setMessage(builderForValue.build());
+ }
+ bitField0_ |= 0x00000001;
+ return this;
+ }
+ /**
+ * optional .orc.proto.PostScript postscript = 1;
+ */
+ public Builder mergePostscript(org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript value) {
+ if (postscriptBuilder_ == null) {
+ if (((bitField0_ & 0x00000001) == 0x00000001) &&
+ postscript_ != org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript.getDefaultInstance()) {
+ postscript_ =
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript.newBuilder(postscript_).mergeFrom(value).buildPartial();
+ } else {
+ postscript_ = value;
+ }
+ onChanged();
+ } else {
+ postscriptBuilder_.mergeFrom(value);
+ }
+ bitField0_ |= 0x00000001;
+ return this;
+ }
+ /**
+ * optional .orc.proto.PostScript postscript = 1;
+ */
+ public Builder clearPostscript() {
+ if (postscriptBuilder_ == null) {
+ postscript_ = org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript.getDefaultInstance();
+ onChanged();
+ } else {
+ postscriptBuilder_.clear();
+ }
+ bitField0_ = (bitField0_ & ~0x00000001);
+ return this;
+ }
+ /**
+ * optional .orc.proto.PostScript postscript = 1;
+ */
+ public org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript.Builder getPostscriptBuilder() {
+ bitField0_ |= 0x00000001;
+ onChanged();
+ return getPostscriptFieldBuilder().getBuilder();
+ }
+ /**
+ * optional .orc.proto.PostScript postscript = 1;
+ */
+ public org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScriptOrBuilder getPostscriptOrBuilder() {
+ if (postscriptBuilder_ != null) {
+ return postscriptBuilder_.getMessageOrBuilder();
+ } else {
+ return postscript_;
+ }
+ }
+ /**
+ * optional .orc.proto.PostScript postscript = 1;
+ */
+ private com.google.protobuf.SingleFieldBuilder<
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript, org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript.Builder, org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScriptOrBuilder>
+ getPostscriptFieldBuilder() {
+ if (postscriptBuilder_ == null) {
+ postscriptBuilder_ = new com.google.protobuf.SingleFieldBuilder<
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript, org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScript.Builder, org.apache.hadoop.hive.ql.io.orc.OrcProto.PostScriptOrBuilder>(
+ postscript_,
+ getParentForChildren(),
+ isClean());
+ postscript_ = null;
+ }
+ return postscriptBuilder_;
+ }
+
+ // optional .orc.proto.Footer footer = 2;
+ private org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer footer_ = org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer.getDefaultInstance();
+ private com.google.protobuf.SingleFieldBuilder<
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer, org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer.Builder, org.apache.hadoop.hive.ql.io.orc.OrcProto.FooterOrBuilder> footerBuilder_;
+ /**
+ * optional .orc.proto.Footer footer = 2;
+ */
+ public boolean hasFooter() {
+ return ((bitField0_ & 0x00000002) == 0x00000002);
+ }
+ /**
+ * optional .orc.proto.Footer footer = 2;
+ */
+ public org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer getFooter() {
+ if (footerBuilder_ == null) {
+ return footer_;
+ } else {
+ return footerBuilder_.getMessage();
+ }
+ }
+ /**
+ * optional .orc.proto.Footer footer = 2;
+ */
+ public Builder setFooter(org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer value) {
+ if (footerBuilder_ == null) {
+ if (value == null) {
+ throw new NullPointerException();
+ }
+ footer_ = value;
+ onChanged();
+ } else {
+ footerBuilder_.setMessage(value);
+ }
+ bitField0_ |= 0x00000002;
+ return this;
+ }
+ /**
+ * optional .orc.proto.Footer footer = 2;
+ */
+ public Builder setFooter(
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer.Builder builderForValue) {
+ if (footerBuilder_ == null) {
+ footer_ = builderForValue.build();
+ onChanged();
+ } else {
+ footerBuilder_.setMessage(builderForValue.build());
+ }
+ bitField0_ |= 0x00000002;
+ return this;
+ }
+ /**
+ * optional .orc.proto.Footer footer = 2;
+ */
+ public Builder mergeFooter(org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer value) {
+ if (footerBuilder_ == null) {
+ if (((bitField0_ & 0x00000002) == 0x00000002) &&
+ footer_ != org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer.getDefaultInstance()) {
+ footer_ =
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer.newBuilder(footer_).mergeFrom(value).buildPartial();
+ } else {
+ footer_ = value;
+ }
+ onChanged();
+ } else {
+ footerBuilder_.mergeFrom(value);
+ }
+ bitField0_ |= 0x00000002;
+ return this;
+ }
+ /**
+ * optional .orc.proto.Footer footer = 2;
+ */
+ public Builder clearFooter() {
+ if (footerBuilder_ == null) {
+ footer_ = org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer.getDefaultInstance();
+ onChanged();
+ } else {
+ footerBuilder_.clear();
+ }
+ bitField0_ = (bitField0_ & ~0x00000002);
+ return this;
+ }
+ /**
+ * optional .orc.proto.Footer footer = 2;
+ */
+ public org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer.Builder getFooterBuilder() {
+ bitField0_ |= 0x00000002;
+ onChanged();
+ return getFooterFieldBuilder().getBuilder();
+ }
+ /**
+ * optional .orc.proto.Footer footer = 2;
+ */
+ public org.apache.hadoop.hive.ql.io.orc.OrcProto.FooterOrBuilder getFooterOrBuilder() {
+ if (footerBuilder_ != null) {
+ return footerBuilder_.getMessageOrBuilder();
+ } else {
+ return footer_;
+ }
+ }
+ /**
+ * optional .orc.proto.Footer footer = 2;
+ */
+ private com.google.protobuf.SingleFieldBuilder<
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer, org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer.Builder, org.apache.hadoop.hive.ql.io.orc.OrcProto.FooterOrBuilder>
+ getFooterFieldBuilder() {
+ if (footerBuilder_ == null) {
+ footerBuilder_ = new com.google.protobuf.SingleFieldBuilder<
+ org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer, org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer.Builder, org.apache.hadoop.hive.ql.io.orc.OrcProto.FooterOrBuilder>(
+ footer_,
+ getParentForChildren(),
+ isClean());
+ footer_ = null;
+ }
+ return footerBuilder_;
+ }
+
+ // optional uint64 fileLength = 3;
+ private long fileLength_ ;
+ /**
+ * optional uint64 fileLength = 3;
+ */
+ public boolean hasFileLength() {
+ return ((bitField0_ & 0x00000004) == 0x00000004);
+ }
+ /**
+ * optional uint64 fileLength = 3;
+ */
+ public long getFileLength() {
+ return fileLength_;
+ }
+ /**
+ * optional uint64 fileLength = 3;
+ */
+ public Builder setFileLength(long value) {
+ bitField0_ |= 0x00000004;
+ fileLength_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * optional uint64 fileLength = 3;
+ */
+ public Builder clearFileLength() {
+ bitField0_ = (bitField0_ & ~0x00000004);
+ fileLength_ = 0L;
+ onChanged();
+ return this;
+ }
+
+ // optional uint64 postscriptLength = 4;
+ private long postscriptLength_ ;
+ /**
+ * optional uint64 postscriptLength = 4;
+ */
+ public boolean hasPostscriptLength() {
+ return ((bitField0_ & 0x00000008) == 0x00000008);
+ }
+ /**
+ * optional uint64 postscriptLength = 4;
+ */
+ public long getPostscriptLength() {
+ return postscriptLength_;
+ }
+ /**
+ * optional uint64 postscriptLength = 4;
+ */
+ public Builder setPostscriptLength(long value) {
+ bitField0_ |= 0x00000008;
+ postscriptLength_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * optional uint64 postscriptLength = 4;
+ */
+ public Builder clearPostscriptLength() {
+ bitField0_ = (bitField0_ & ~0x00000008);
+ postscriptLength_ = 0L;
+ onChanged();
+ return this;
+ }
+
+ // @@protoc_insertion_point(builder_scope:orc.proto.FileTail)
+ }
+
+ static {
+ defaultInstance = new FileTail(true);
+ defaultInstance.initFields();
+ }
+
+ // @@protoc_insertion_point(class_scope:orc.proto.FileTail)
+ }
+
private static com.google.protobuf.Descriptors.Descriptor
internal_static_orc_proto_IntegerStatistics_descriptor;
private static
@@ -19055,6 +19924,11 @@ public Builder setMagicBytes(
private static
com.google.protobuf.GeneratedMessage.FieldAccessorTable
internal_static_orc_proto_PostScript_fieldAccessorTable;
+ private static com.google.protobuf.Descriptors.Descriptor
+ internal_static_orc_proto_FileTail_descriptor;
+ private static
+ com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internal_static_orc_proto_FileTail_fieldAccessorTable;
public static com.google.protobuf.Descriptors.FileDescriptor
getDescriptor() {
@@ -19135,10 +20009,13 @@ public Builder setMagicBytes(
"ression\030\002 \001(\0162\032.orc.proto.CompressionKin" +
"d\022\034\n\024compressionBlockSize\030\003 \001(\004\022\023\n\007versi",
"on\030\004 \003(\rB\002\020\001\022\026\n\016metadataLength\030\005 \001(\004\022\025\n\r" +
- "writerVersion\030\006 \001(\r\022\016\n\005magic\030\300> \001(\t*:\n\017C" +
- "ompressionKind\022\010\n\004NONE\020\000\022\010\n\004ZLIB\020\001\022\n\n\006SN" +
- "APPY\020\002\022\007\n\003LZO\020\003B\"\n org.apache.hadoop.hiv" +
- "e.ql.io.orc"
+ "writerVersion\030\006 \001(\r\022\016\n\005magic\030\300> \001(\t\"\206\001\n\010" +
+ "FileTail\022)\n\npostscript\030\001 \001(\0132\025.orc.proto" +
+ ".PostScript\022!\n\006footer\030\002 \001(\0132\021.orc.proto." +
+ "Footer\022\022\n\nfileLength\030\003 \001(\004\022\030\n\020postscript" +
+ "Length\030\004 \001(\004*:\n\017CompressionKind\022\010\n\004NONE\020" +
+ "\000\022\010\n\004ZLIB\020\001\022\n\n\006SNAPPY\020\002\022\007\n\003LZO\020\003B\"\n org." +
+ "apache.hadoop.hive.ql.io.orc"
};
com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner =
new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
@@ -19283,6 +20160,12 @@ public Builder setMagicBytes(
com.google.protobuf.GeneratedMessage.FieldAccessorTable(
internal_static_orc_proto_PostScript_descriptor,
new java.lang.String[] { "FooterLength", "Compression", "CompressionBlockSize", "Version", "MetadataLength", "WriterVersion", "Magic", });
+ internal_static_orc_proto_FileTail_descriptor =
+ getDescriptor().getMessageTypes().get(23);
+ internal_static_orc_proto_FileTail_fieldAccessorTable = new
+ com.google.protobuf.GeneratedMessage.FieldAccessorTable(
+ internal_static_orc_proto_FileTail_descriptor,
+ new java.lang.String[] { "Postscript", "Footer", "FileLength", "PostscriptLength", });
return null;
}
};
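
Note (illustrative only, not part of the generated code or the patch): the builder above follows the stock protobuf 2.x pattern, so a FileTail can be assembled, serialized, and parsed back like any other message. All values below are made up for the example.

    import org.apache.hadoop.hive.ql.io.orc.OrcProto;

    public class FileTailExample {
      public static void main(String[] args) throws Exception {
        // Build a FileTail with the generated builder; nested messages use their own builders.
        OrcProto.FileTail tail = OrcProto.FileTail.newBuilder()
            .setPostscript(OrcProto.PostScript.newBuilder()
                .setFooterLength(100)
                .setCompression(OrcProto.CompressionKind.ZLIB))
            .setFileLength(4096L)
            .setPostscriptLength(23L)
            .build();

        // Round-trip through the wire format, as the new split serialization below does.
        byte[] bytes = tail.toByteArray();
        OrcProto.FileTail parsed = OrcProto.FileTail.parseFrom(bytes);
        System.out.println(parsed.hasPostscript() + " " + parsed.getFileLength());
      }
    }
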
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
index 3ca0a6e..906eb6b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
@@ -194,14 +194,14 @@ public static Reader createReader(FileSystem fs, Path path
public static class ReaderOptions {
private final Configuration conf;
private FileSystem filesystem;
- private ReaderImpl.FileMetaInfo fileMetaInfo;
private long maxLength = Long.MAX_VALUE;
+ private OrcTail orcTail;
public ReaderOptions(Configuration conf) {
this.conf = conf;
}
- ReaderOptions fileMetaInfo(ReaderImpl.FileMetaInfo info) {
- fileMetaInfo = info;
+ ReaderOptions orcTail(OrcTail orcTail) {
+ this.orcTail = orcTail;
return this;
}
@@ -223,8 +223,8 @@ FileSystem getFilesystem() {
return filesystem;
}
- ReaderImpl.FileMetaInfo getFileMetaInfo() {
- return fileMetaInfo;
+ OrcTail getOrcTail() {
+ return orcTail;
}
long getMaxLength() {
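
Note (hedged sketch, not part of the diff): with the new option, a caller in the same package that already holds an OrcSplit can hand the cached tail and file length to the reader, so the file tail is not read again from the filesystem. The variables conf, fs, and split are assumed to exist; orcTail() and maxLength() are package-private, so this would live in org.apache.hadoop.hive.ql.io.orc.

    OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf)
        .filesystem(fs)
        .maxLength(split.getFileLength());
    if (split.hasFooter()) {
      opts.orcTail(split.getOrcTail());
    }
    Reader reader = OrcFile.createReader(split.getPath(), opts);
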
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 64abe91..35469d1 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -78,10 +78,12 @@
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
+import com.google.common.annotations.VisibleForTesting;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
+
/**
* A MapReduce/Hive input format for ORC files.
*
@@ -127,6 +129,7 @@
private static final long DEFAULT_MIN_SPLIT_SIZE = 16 * 1024 * 1024;
private static final long DEFAULT_MAX_SPLIT_SIZE = 256 * 1024 * 1024;
private static final int DEFAULT_ETL_FILE_THRESHOLD = 100;
+ private static final int DEFAULT_CACHE_INITIAL_CAPACITY = 1024;
private static final PerfLogger perfLogger = PerfLogger.getPerfLogger();
private static final String CLASS_NAME = ReaderImpl.class.getName();
@@ -430,7 +433,7 @@ public boolean validateInput(FileSystem fs, HiveConf conf,
*/
static class Context {
private final Configuration conf;
- private static Cache<Path, FileInfo> footerCache;
+ private static Cache<Path, OrcTail> footerCache;
private static ExecutorService threadPool = null;
private final int numBuckets;
private final long maxSize;
@@ -467,6 +470,8 @@ public boolean validateInput(FileSystem fs, HiveConf conf,
ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_SIZE);
int numThreads = HiveConf.getIntVar(conf,
ConfVars.HIVE_ORC_COMPUTE_SPLITS_NUM_THREADS);
+ boolean useSoftReference = HiveConf.getBoolVar(conf,
+ ConfVars.HIVE_ORC_CACHE_USE_SOFT_REFERENCES);
cacheStripeDetails = (cacheStripeDetailsSize > 0);
@@ -480,12 +485,14 @@ public boolean validateInput(FileSystem fs, HiveConf conf,
}
if (footerCache == null && cacheStripeDetails) {
- footerCache = CacheBuilder.newBuilder()
+ CacheBuilder builder = CacheBuilder.newBuilder()
+ .initialCapacity(DEFAULT_CACHE_INITIAL_CAPACITY)
.concurrencyLevel(numThreads)
- .initialCapacity(cacheStripeDetailsSize)
- .maximumSize(cacheStripeDetailsSize)
- .softValues()
- .build();
+ .maximumSize(cacheStripeDetailsSize);
+ if (useSoftReference) {
+ builder = builder.softValues();
+ }
+ footerCache = builder.build();
}
}
String value = conf.get(ValidTxnList.VALID_TXNS_KEY,
@@ -502,13 +509,13 @@ public boolean validateInput(FileSystem fs, HiveConf conf,
private final Context context;
private final FileSystem fs;
private final FileStatus file;
- private final FileInfo fileInfo;
+ private final OrcTail orcTail;
private final boolean isOriginal;
private final List deltas;
private final boolean hasBase;
SplitInfo(Context context, FileSystem fs,
- FileStatus file, FileInfo fileInfo,
+ FileStatus file, OrcTail orcTail,
boolean isOriginal,
List deltas,
boolean hasBase, Path dir, boolean[] covered) throws IOException {
@@ -516,7 +523,7 @@ public boolean validateInput(FileSystem fs, HiveConf conf,
this.context = context;
this.fs = fs;
this.file = file;
- this.fileInfo = fileInfo;
+ this.orcTail = orcTail;
this.isOriginal = isOriginal;
this.deltas = deltas;
this.hasBase = hasBase;
@@ -547,26 +554,26 @@ public ETLSplitStrategy(Context context, FileSystem fs, Path dir, List
public List<SplitInfo> getSplits() throws IOException {
List<SplitInfo> result = Lists.newArrayList();
for (FileStatus file : files) {
- FileInfo info = null;
+ OrcTail orcTail = null;
if (context.cacheStripeDetails) {
- info = verifyCachedFileInfo(file);
+ orcTail = verifyCachedOrcTail(file);
}
// ignore files of 0 length
if (file.getLen() > 0) {
- result.add(new SplitInfo(context, fs, file, info, isOriginal, deltas, true, dir, covered));
+ result.add(new SplitInfo(context, fs, file, orcTail, isOriginal, deltas, true, dir, covered));
}
}
return result;
@@ -630,7 +637,7 @@ public BISplitStrategy(Context context, FileSystem fs,
for (Map.Entry entry : blockOffsets.entrySet()) {
OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), entry.getKey(),
entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
- deltas, -1);
+ deltas, -1, fileStatus.getLen());
splits.add(orcSplit);
}
}
@@ -671,7 +678,7 @@ public ACIDSplitStrategy(Path dir, int numBuckets, List deltas, b
if (!deltas.isEmpty()) {
for (int b = 0; b < numBuckets; ++b) {
if (!covered[b]) {
- splits.add(new OrcSplit(dir, b, 0, new String[0], null, false, false, deltas, -1));
+ splits.add(new OrcSplit(dir, b, 0, new String[0], null, false, false, deltas, -1, -1));
}
}
}
@@ -775,12 +782,11 @@ public SplitStrategy call() throws IOException {
private final Context context;
private final FileSystem fs;
private final FileStatus file;
+ private OrcTail orcTail;
private final long blockSize;
private final TreeMap locations;
- private final FileInfo fileInfo;
private List stripes;
- private ReaderImpl.FileMetaInfo fileMetaInfo;
- private Metadata metadata;
+ private List<StripeStatistics> stripeStats;
private List types;
private boolean[] includedCols;
private final boolean isOriginal;
@@ -795,7 +801,7 @@ public SplitGenerator(SplitInfo splitInfo) throws IOException {
this.fs = splitInfo.fs;
this.file = splitInfo.file;
this.blockSize = file.getBlockSize();
- this.fileInfo = splitInfo.fileInfo;
+ this.orcTail = splitInfo.orcTail;
locations = SHIMS.getLocationsWithOffset(fs, file);
this.isOriginal = splitInfo.isOriginal;
this.deltas = splitInfo.deltas;
@@ -839,11 +845,11 @@ static long getOverlap(long offset1, long length1,
* are written with large block sizes.
* @param offset the start of the split
* @param length the length of the split
- * @param fileMetaInfo file metadata from footer and postscript
+ * @param orcTail orc file tail
* @throws IOException
*/
OrcSplit createSplit(long offset, long length,
- ReaderImpl.FileMetaInfo fileMetaInfo) throws IOException {
+ OrcTail orcTail) throws IOException {
String[] hosts;
Map.Entry startEntry = locations.floorEntry(offset);
BlockLocation start = startEntry.getValue();
@@ -902,8 +908,8 @@ OrcSplit createSplit(long offset, long length,
final double splitRatio = (double) length / (double) fileLen;
final long scaledProjSize = projColsUncompressedSize > 0 ?
(long) (splitRatio * projColsUncompressedSize) : fileLen;
- return new OrcSplit(file.getPath(), offset, length, hosts, fileMetaInfo,
- isOriginal, hasBase, deltas, scaledProjSize);
+ return new OrcSplit(file.getPath(), offset, length, hosts, orcTail,
+ isOriginal, hasBase, deltas, scaledProjSize, fileLen);
}
/**
@@ -929,7 +935,6 @@ OrcSplit createSplit(long offset, long length,
writerVersion != OrcFile.WriterVersion.ORIGINAL) {
SearchArgument sarg = options.getSearchArgument();
List sargLeaves = sarg.getLeaves();
- List stripeStats = metadata.getStripeStatistics();
int[] filterColumns = RecordReaderImpl.mapSargColumns(sargLeaves,
options.getColumnNames(), getRootColumn(isOriginal));
@@ -965,7 +970,7 @@ OrcSplit createSplit(long offset, long length,
if (!includeStripe[idx]) {
// create split for the previous unfinished stripe
if (currentOffset != -1) {
- splits.add(createSplit(currentOffset, currentLength, fileMetaInfo));
+ splits.add(createSplit(currentOffset, currentLength, orcTail));
currentOffset = -1;
}
continue;
@@ -975,7 +980,7 @@ OrcSplit createSplit(long offset, long length,
// crossed a block boundary, cut the input split here.
if (currentOffset != -1 && currentLength > context.minSize &&
(currentOffset / blockSize != stripe.getOffset() / blockSize)) {
- splits.add(createSplit(currentOffset, currentLength, fileMetaInfo));
+ splits.add(createSplit(currentOffset, currentLength, orcTail));
currentOffset = -1;
}
// if we aren't building a split, start a new one.
@@ -987,12 +992,12 @@ OrcSplit createSplit(long offset, long length,
(stripe.getOffset() + stripe.getLength()) - currentOffset;
}
if (currentLength >= context.maxSize) {
- splits.add(createSplit(currentOffset, currentLength, fileMetaInfo));
+ splits.add(createSplit(currentOffset, currentLength, orcTail));
currentOffset = -1;
}
}
if (currentOffset != -1) {
- splits.add(createSplit(currentOffset, currentLength, fileMetaInfo));
+ splits.add(createSplit(currentOffset, currentLength, orcTail));
}
// add uncovered ACID delta splits
@@ -1001,41 +1006,34 @@ OrcSplit createSplit(long offset, long length,
}
private void populateAndCacheStripeDetails() throws IOException {
- Reader orcReader = OrcFile.createReader(file.getPath(),
- OrcFile.readerOptions(context.conf).filesystem(fs).maxLength(file.getLen()));
- if (fileInfo != null) {
- stripes = fileInfo.stripeInfos;
- fileMetaInfo = fileInfo.fileMetaInfo;
- metadata = fileInfo.metadata;
- types = fileInfo.types;
- writerVersion = fileInfo.writerVersion;
- // For multiple runs, in case sendSplitsInFooter changes
- if (fileMetaInfo == null && context.footerInSplits) {
- fileInfo.fileMetaInfo = ((ReaderImpl) orcReader).getFileMetaInfo();
- fileInfo.metadata = orcReader.getMetadata();
- fileInfo.types = orcReader.getTypes();
- fileInfo.writerVersion = orcReader.getWriterVersion();
- }
- } else {
- stripes = orcReader.getStripes();
- metadata = orcReader.getMetadata();
- types = orcReader.getTypes();
- writerVersion = orcReader.getWriterVersion();
- fileMetaInfo = context.footerInSplits ?
- ((ReaderImpl) orcReader).getFileMetaInfo() : null;
+ if (orcTail == null) {
+ Reader orcReader = OrcFile.createReader(file.getPath(),
+ OrcFile.readerOptions(context.conf)
+ .filesystem(fs)
+ .maxLength(file.getLen()));
+ orcTail = new OrcTail(orcReader.getFileTail(), orcReader.getSerializedFileFooter(), file.getModificationTime());
if (context.cacheStripeDetails) {
- // Populate into cache.
- Context.footerCache.put(file.getPath(),
- new FileInfo(file.getModificationTime(), file.getLen(), stripes,
- metadata, types, fileMetaInfo, writerVersion));
+ context.footerCache.put(file.getPath(), orcTail);
}
}
+
+ stripes = orcTail.getStripes();
+ stripeStats = orcTail.getStripeStatistics();
+ types = orcTail.getTypes();
+ writerVersion = orcTail.getWriterVersion();
includedCols = genIncludedColumns(types, context.conf, isOriginal);
- projColsUncompressedSize = computeProjectionSize(orcReader, includedCols, isOriginal);
+ List<OrcProto.ColumnStatistics> fileColStats = orcTail.getFooter().getStatisticsList();
+ projColsUncompressedSize = computeProjectionSize(types, fileColStats, includedCols,
+ isOriginal);
+ if (!context.footerInSplits) {
+ orcTail = null;
+ }
}
- private long computeProjectionSize(final Reader orcReader, final boolean[] includedCols,
- final boolean isOriginal) {
+ private long computeProjectionSize(List<OrcProto.Type> types,
+ List<OrcProto.ColumnStatistics> stats,
+ boolean[] includedCols,
+ boolean isOriginal) {
final int rootIdx = getRootColumn(isOriginal);
List internalColIds = Lists.newArrayList();
if (includedCols != null) {
@@ -1045,7 +1043,7 @@ private long computeProjectionSize(final Reader orcReader, final boolean[] inclu
}
}
}
- return orcReader.getRawDataSizeFromColIndices(internalColIds);
+ return ReaderImpl.getRawDataSizeFromColIndices(internalColIds, types, stats);
}
private boolean isStripeSatisfyPredicate(StripeStatistics stripeStatistics,
@@ -1149,37 +1147,6 @@ private static void cancelFutures(List> futures) {
return result.toArray(new InputSplit[result.size()]);
}
- /**
- * FileInfo.
- *
- * Stores information relevant to split generation for an ORC File.
- *
- */
- private static class FileInfo {
- long modificationTime;
- long size;
- List stripeInfos;
- ReaderImpl.FileMetaInfo fileMetaInfo;
- Metadata metadata;
- List types;
- private OrcFile.WriterVersion writerVersion;
-
-
- FileInfo(long modificationTime, long size,
- List stripeInfos,
- Metadata metadata, List types,
- ReaderImpl.FileMetaInfo fileMetaInfo,
- OrcFile.WriterVersion writerVersion) {
- this.modificationTime = modificationTime;
- this.size = size;
- this.stripeInfos = stripeInfos;
- this.fileMetaInfo = fileMetaInfo;
- this.metadata = metadata;
- this.types = types;
- this.writerVersion = writerVersion;
- }
- }
-
@SuppressWarnings("unchecked")
private org.apache.hadoop.mapred.RecordReader
createVectorizedReader(InputSplit split, JobConf conf, Reporter reporter
@@ -1199,13 +1166,17 @@ private static void cancelFutures(List> futures) {
if (vectorMode) {
return createVectorizedReader(inputSplit, conf, reporter);
} else {
+ OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(conf);
+ if (inputSplit instanceof OrcSplit) {
+ OrcSplit split = (OrcSplit) inputSplit;
+ readerOptions.maxLength(split.getFileLength()).orcTail(split.getOrcTail());
+ }
return new OrcRecordReader(OrcFile.createReader(
((FileSplit) inputSplit).getPath(),
- OrcFile.readerOptions(conf)), conf, (FileSplit) inputSplit);
+ readerOptions), conf, (FileSplit) inputSplit);
}
}
- OrcSplit split = (OrcSplit) inputSplit;
reporter.setStatus(inputSplit.toString());
Options options = new Options(conf).reporter(reporter);
@@ -1309,7 +1280,12 @@ public float getProgress() throws IOException {
if (split.hasBase()) {
bucket = AcidUtils.parseBaseBucketFilename(split.getPath(), conf)
.getBucket();
- reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+ OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(conf)
+ .maxLength(split.getFileLength());
+ if (split.hasFooter()) {
+ readerOptions.orcTail(split.getOrcTail());
+ }
+ reader = OrcFile.createReader(path, readerOptions);
} else {
bucket = (int) split.getStart();
reader = null;
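
Note (minimal sketch, not part of the patch): the cache construction above boils down to the following Guava CacheBuilder usage, with soft values now opt-in via HIVE_ORC_CACHE_USE_SOFT_REFERENCES; the class name, method name, and parameters here are hypothetical, and the class is assumed to sit in the orc package so it can see OrcTail.

    import com.google.common.cache.Cache;
    import com.google.common.cache.CacheBuilder;
    import org.apache.hadoop.fs.Path;

    final class FooterCacheFactory {
      static Cache<Path, OrcTail> create(int maxEntries, int numThreads, boolean useSoftReference) {
        CacheBuilder<Object, Object> builder = CacheBuilder.newBuilder()
            .initialCapacity(1024)        // DEFAULT_CACHE_INITIAL_CAPACITY in the patch
            .concurrencyLevel(numThreads)
            .maximumSize(maxEntries);
        if (useSoftReference) {
          // soft values let the JVM evict cached tails under memory pressure
          builder = builder.softValues();
        }
        return builder.<Path, OrcTail>build();
      }
    }
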
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java
index b58c880..77d6eb5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java
@@ -34,7 +34,7 @@
*
*/
public class OrcNewSplit extends FileSplit {
- private ReaderImpl.FileMetaInfo fileMetaInfo;
+ private OrcTail orcTail;
private boolean hasFooter;
private boolean isOriginal;
private boolean hasBase;
@@ -51,7 +51,7 @@ protected OrcNewSplit(){
public OrcNewSplit(OrcSplit inner) throws IOException {
super(inner.getPath(), inner.getStart(), inner.getLength(),
inner.getLocations());
- this.fileMetaInfo = inner.getFileMetaInfo();
+ this.orcTail = inner.getOrcTail();
this.hasFooter = inner.hasFooter();
this.isOriginal = inner.isOriginal();
this.hasBase = inner.hasBase();
@@ -72,20 +72,11 @@ public void write(DataOutput out) throws IOException {
delta.write(out);
}
if (hasFooter) {
- // serialize FileMetaInfo fields
- Text.writeString(out, fileMetaInfo.compressionType);
- WritableUtils.writeVInt(out, fileMetaInfo.bufferSize);
- WritableUtils.writeVInt(out, fileMetaInfo.metadataSize);
-
- // serialize FileMetaInfo field footer
- ByteBuffer footerBuff = fileMetaInfo.footerBuffer;
- footerBuff.reset();
-
- // write length of buffer
- WritableUtils.writeVInt(out, footerBuff.limit() - footerBuff.position());
- out.write(footerBuff.array(), footerBuff.position(),
- footerBuff.limit() - footerBuff.position());
- WritableUtils.writeVInt(out, fileMetaInfo.writerVersion.getId());
+ OrcProto.FileTail fileTail = orcTail.getMinimalFileTail();
+ byte[] tailBuffer = fileTail.toByteArray();
+ int tailLen = tailBuffer.length;
+ WritableUtils.writeVInt(out, tailLen);
+ out.write(tailBuffer);
}
}
@@ -107,27 +98,14 @@ public void readFields(DataInput in) throws IOException {
deltas.add(dmd);
}
if (hasFooter) {
- // deserialize FileMetaInfo fields
- String compressionType = Text.readString(in);
- int bufferSize = WritableUtils.readVInt(in);
- int metadataSize = WritableUtils.readVInt(in);
-
- // deserialize FileMetaInfo field footer
- int footerBuffSize = WritableUtils.readVInt(in);
- ByteBuffer footerBuff = ByteBuffer.allocate(footerBuffSize);
- in.readFully(footerBuff.array(), 0, footerBuffSize);
- OrcFile.WriterVersion writerVersion =
- ReaderImpl.getWriterVersion(WritableUtils.readVInt(in));
-
- fileMetaInfo = new ReaderImpl.FileMetaInfo(compressionType, bufferSize,
- metadataSize, footerBuff, writerVersion);
+ int tailLen = WritableUtils.readVInt(in);
+ byte[] tailBuffer = new byte[tailLen];
+ in.readFully(tailBuffer);
+ OrcProto.FileTail fileTail = OrcProto.FileTail.parseFrom(tailBuffer);
+ orcTail = new OrcTail(fileTail, null);
}
}
- ReaderImpl.FileMetaInfo getFileMetaInfo(){
- return fileMetaInfo;
- }
-
public boolean hasFooter() {
return hasFooter;
}
@@ -143,4 +121,8 @@ public boolean hasBase() {
public List getDeltas() {
return deltas;
}
+
+ public OrcTail getOrcTail() {
+ return orcTail;
+ }
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
index 3e74df5..43ebdb4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
@@ -39,13 +39,14 @@
*
*/
public class OrcSplit extends FileSplit {
- private ReaderImpl.FileMetaInfo fileMetaInfo;
+ private OrcTail orcTail;
private boolean hasFooter;
private boolean isOriginal;
private boolean hasBase;
private final List deltas = new ArrayList<>();
private OrcFile.WriterVersion writerVersion;
private long projColsUncompressedSize;
+ private long fileLen;
static final int BASE_FLAG = 4;
static final int ORIGINAL_FLAG = 2;
@@ -59,15 +60,17 @@ protected OrcSplit(){
}
public OrcSplit(Path path, long offset, long length, String[] hosts,
- ReaderImpl.FileMetaInfo fileMetaInfo, boolean isOriginal, boolean hasBase,
- List deltas, long projectedDataSize) {
+ OrcTail orcTail, boolean isOriginal, boolean hasBase,
+ List deltas, long projectedDataSize, long fileLen) {
super(path, offset, length, hosts);
- this.fileMetaInfo = fileMetaInfo;
- hasFooter = this.fileMetaInfo != null;
+ this.orcTail = orcTail;
+ hasFooter = this.orcTail != null;
this.isOriginal = isOriginal;
this.hasBase = hasBase;
this.deltas.addAll(deltas);
this.projColsUncompressedSize = projectedDataSize;
+ // setting the file length to Long.MAX_VALUE makes the ORC reader look up the actual length from the file system
+ this.fileLen = fileLen <= 0 ? Long.MAX_VALUE : fileLen;
}
@Override
@@ -84,21 +87,13 @@ public void write(DataOutput out) throws IOException {
delta.write(out);
}
if (hasFooter) {
- // serialize FileMetaInfo fields
- Text.writeString(out, fileMetaInfo.compressionType);
- WritableUtils.writeVInt(out, fileMetaInfo.bufferSize);
- WritableUtils.writeVInt(out, fileMetaInfo.metadataSize);
-
- // serialize FileMetaInfo field footer
- ByteBuffer footerBuff = fileMetaInfo.footerBuffer;
- footerBuff.reset();
-
- // write length of buffer
- WritableUtils.writeVInt(out, footerBuff.limit() - footerBuff.position());
- out.write(footerBuff.array(), footerBuff.position(),
- footerBuff.limit() - footerBuff.position());
- WritableUtils.writeVInt(out, fileMetaInfo.writerVersion.getId());
+ OrcProto.FileTail fileTail = orcTail.getMinimalFileTail();
+ byte[] tailBuffer = fileTail.toByteArray();
+ int tailLen = tailBuffer.length;
+ WritableUtils.writeVInt(out, tailLen);
+ out.write(tailBuffer);
}
+ out.writeLong(fileLen);
}
@Override
@@ -119,25 +114,13 @@ public void readFields(DataInput in) throws IOException {
deltas.add(dmd);
}
if (hasFooter) {
- // deserialize FileMetaInfo fields
- String compressionType = Text.readString(in);
- int bufferSize = WritableUtils.readVInt(in);
- int metadataSize = WritableUtils.readVInt(in);
-
- // deserialize FileMetaInfo field footer
- int footerBuffSize = WritableUtils.readVInt(in);
- ByteBuffer footerBuff = ByteBuffer.allocate(footerBuffSize);
- in.readFully(footerBuff.array(), 0, footerBuffSize);
- OrcFile.WriterVersion writerVersion =
- ReaderImpl.getWriterVersion(WritableUtils.readVInt(in));
-
- fileMetaInfo = new ReaderImpl.FileMetaInfo(compressionType, bufferSize,
- metadataSize, footerBuff, writerVersion);
+ int tailLen = WritableUtils.readVInt(in);
+ byte[] tailBuffer = new byte[tailLen];
+ in.readFully(tailBuffer);
+ OrcProto.FileTail fileTail = OrcProto.FileTail.parseFrom(tailBuffer);
+ orcTail = new OrcTail(fileTail, null);
}
- }
-
- ReaderImpl.FileMetaInfo getFileMetaInfo(){
- return fileMetaInfo;
+ fileLen = in.readLong();
}
public boolean hasFooter() {
@@ -168,4 +151,19 @@ public boolean isAcid() {
public long getProjectedColumnsUncompressedSize() {
return projColsUncompressedSize;
}
+
+ public long getFileLength() {
+ return fileLen;
+ }
+
+ public OrcTail getOrcTail() {
+ return orcTail;
+ }
+
+ @Override
+ public String toString() {
+ return "OrcSplit [" + getPath() + ", start=" + getStart() + ", length=" + getLength()
+ + ", isOriginal=" + isOriginal + ", fileLength=" + fileLen + ", hasFooter=" + hasFooter +
+ ", hasBase=" + hasBase + ", deltas=" + (deltas == null ? 0 : deltas.size()) + "]";
+ }
}
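
Note (illustrative sketch, not part of the patch; the class and method names are hypothetical): the footer portion of a split is now written as a vint-prefixed protobuf FileTail, with OrcSplit additionally appending the file length, and readFields() mirrors that layout.

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    import org.apache.hadoop.io.WritableUtils;
    import org.apache.hadoop.hive.ql.io.orc.OrcProto;

    final class FileTailWireFormat {
      // Mirrors the tail-writing part of OrcSplit.write() / OrcNewSplit.write().
      static void writeTail(DataOutput out, OrcProto.FileTail tail, long fileLen) throws IOException {
        byte[] tailBuffer = tail.toByteArray();
        WritableUtils.writeVInt(out, tailBuffer.length);
        out.write(tailBuffer);
        out.writeLong(fileLen);          // OrcSplit appends the file length after the tail
      }

      // Mirrors the tail-reading part of OrcSplit.readFields(); the file length is read afterwards.
      static OrcProto.FileTail readTail(DataInput in) throws IOException {
        int tailLen = WritableUtils.readVInt(in);
        byte[] tailBuffer = new byte[tailLen];
        in.readFully(tailBuffer);
        return OrcProto.FileTail.parseFrom(tailBuffer);
      }
    }
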
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcTail.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcTail.java
new file mode 100644
index 0000000..54ca2f0
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcTail.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io.orc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+public final class OrcTail {
+ // postscript + footer - Serialized in OrcSplit
+ private final OrcProto.FileTail fileTail;
+ // serialized representation of metadata, footer and postscript
+ private final ByteBuffer serializedTail;
+ // used to invalidate cache entries
+ private final long fileModificationTime;
+ private OrcProto.Metadata metadata;
+
+ public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail) {
+ this(fileTail, serializedTail, -1);
+ }
+
+ public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail, long fileModificationTime) {
+ this.fileTail = fileTail;
+ this.serializedTail = serializedTail;
+ this.fileModificationTime = fileModificationTime;
+ this.metadata = null; // lazily deserialized
+ }
+
+ public ByteBuffer getSerializedTail() {
+ return serializedTail;
+ }
+
+ public long getFileModificationTime() {
+ return fileModificationTime;
+ }
+
+ public OrcProto.Footer getFooter() {
+ return fileTail.getFooter();
+ }
+
+ public OrcProto.PostScript getPostScript() {
+ return fileTail.getPostscript();
+ }
+
+ public long getFileLength() {
+ return fileTail.getFileLength();
+ }
+
+ public OrcFile.WriterVersion getWriterVersion() {
+ OrcProto.PostScript ps = fileTail.getPostscript();
+ return ReaderImpl.getWriterVersion(ps.getWriterVersion());
+ }
+
+ public List<StripeInformation> getStripes() {
+ List<StripeInformation> result = new ArrayList<>();
+ for (OrcProto.StripeInformation stripeProto : fileTail.getFooter().getStripesList()) {
+ result.add(new ReaderImpl.StripeInformationImpl(stripeProto));
+ }
+ return result;
+ }
+
+ public CompressionKind getCompressionKind() {
+ return CompressionKind.valueOf(fileTail.getPostscript().getCompression().name());
+ }
+
+ public CompressionCodec getCompressionCodec() {
+ return WriterImpl.createCodec(getCompressionKind());
+ }
+
+ public int getCompressionBufferSize() {
+ return (int) fileTail.getPostscript().getCompressionBlockSize();
+ }
+
+ public List<StripeStatistics> getStripeStatistics() throws IOException {
+ List<StripeStatistics> result = new ArrayList<>();
+ List<OrcProto.StripeStatistics> ssProto = getStripeStatisticsProto();
+ if (ssProto != null) {
+ for (OrcProto.StripeStatistics ss : ssProto) {
+ result.add(new StripeStatistics(ss.getColStatsList()));
+ }
+ }
+ return result;
+ }
+
+ public List<OrcProto.StripeStatistics> getStripeStatisticsProto() throws IOException {
+ if (getMetadata() == null) {
+ return null;
+ }
+ return getMetadata().getStripeStatsList();
+ }
+
+ public int getMetadataSize() {
+ return (int) getPostScript().getMetadataLength();
+ }
+
+ public List<OrcProto.Type> getTypes() {
+ return getFooter().getTypesList();
+ }
+
+ public OrcProto.FileTail getFileTail() {
+ return fileTail;
+ }
+
+ public OrcProto.FileTail getMinimalFileTail() {
+ OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder(fileTail);
+ OrcProto.Footer.Builder footerBuilder = OrcProto.Footer.newBuilder(fileTail.getFooter());
+ footerBuilder.clearStatistics();
+ fileTailBuilder.setFooter(footerBuilder.build());
+ OrcProto.FileTail result = fileTailBuilder.build();
+ return result;
+ }
+
+ public OrcProto.Metadata getMetadata() throws IOException {
+ if (serializedTail == null) return null;
+ if (metadata == null) {
+ metadata = ReaderImpl.extractMetadata(serializedTail, 0,
+ (int) fileTail.getPostscript().getMetadataLength(),
+ getCompressionCodec(), getCompressionBufferSize());
+ }
+ return metadata;
+ }
+}
\ No newline at end of file
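
Note (usage sketch under stated assumptions, not part of the patch): split generation can now build an OrcTail once per file from the reader and cache it, and the minimal tail, with column statistics stripped, is what gets serialized into splits. The variables path, fs, conf, and fileStatus are assumed to exist.

    Reader orcReader = OrcFile.createReader(path,
        OrcFile.readerOptions(conf).filesystem(fs).maxLength(fileStatus.getLen()));
    OrcTail tail = new OrcTail(orcReader.getFileTail(),
        orcReader.getSerializedFileFooter(), fileStatus.getModificationTime());

    // Drop per-column statistics so the serialized split stays small.
    OrcProto.FileTail minimal = tail.getMinimalFileTail();
    byte[] serialized = minimal.toByteArray();
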
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java
index 6dbe461..3105738 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java
@@ -146,6 +146,10 @@
*/
OrcFile.WriterVersion getWriterVersion();
+ OrcProto.FileTail getFileTail();
+
+ ByteBuffer getSerializedFileFooter();
+
/**
* Options for creating a RecordReader.
*/
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
index d42675c..1b7f6d6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.hive.ql.io.orc;
import java.io.IOException;
+import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
@@ -30,6 +31,7 @@
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.DiskRange;
@@ -72,6 +74,7 @@
// will help avoid cpu cycles spend in deserializing at cost of increased
// memory footprint.
private final ByteBuffer footerByteBuffer;
+ private final OrcTail tail;
static class StripeInformationImpl
implements StripeInformation {
@@ -308,29 +311,23 @@ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException {
this.path = path;
this.conf = options.getConfiguration();
- FileMetaInfo footerMetaData;
- if (options.getFileMetaInfo() != null) {
- footerMetaData = options.getFileMetaInfo();
+ OrcTail orcTail = options.getOrcTail();
+ if (orcTail == null) {
+ tail = extractFileTail(fs, path, options.getMaxLength());
+ options.orcTail(tail);
} else {
- footerMetaData = extractMetaInfoFromFooter(fs, path,
- options.getMaxLength());
+ tail = orcTail;
}
- MetaInfoObjExtractor rInfo =
- new MetaInfoObjExtractor(footerMetaData.compressionType,
- footerMetaData.bufferSize,
- footerMetaData.metadataSize,
- footerMetaData.footerBuffer
- );
- this.footerByteBuffer = footerMetaData.footerBuffer;
- this.compressionKind = rInfo.compressionKind;
- this.codec = rInfo.codec;
- this.bufferSize = rInfo.bufferSize;
- this.metadataSize = rInfo.metadataSize;
- this.metadata = rInfo.metadata;
- this.footer = rInfo.footer;
- this.inspector = rInfo.inspector;
- this.versionList = footerMetaData.versionList;
- this.writerVersion = footerMetaData.writerVersion;
+ this.footerByteBuffer = tail.getSerializedTail();
+ this.compressionKind = tail.getCompressionKind();
+ this.codec = tail.getCompressionCodec();
+ this.bufferSize = tail.getCompressionBufferSize();
+ this.metadataSize = tail.getMetadataSize();
+ this.versionList = tail.getPostScript().getVersionList();
+ this.footer = tail.getFooter();
+ this.writerVersion = tail.getWriterVersion();
+ this.metadata = tail.getMetadata();
+ this.inspector = OrcStruct.createObjectInspector(0, tail.getTypes());
}
/**
@@ -347,7 +344,7 @@ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException {
return OrcFile.WriterVersion.ORIGINAL;
}
- private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
+ private static OrcTail extractFileTail(FileSystem fs,
Path path,
long maxFileLength
) throws IOException {
@@ -355,14 +352,20 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
ByteBuffer buffer = null;
OrcProto.PostScript ps = null;
OrcFile.WriterVersion writerVersion = null;
+ long modificationTime;
+ OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder();
try {
// figure out the size of the file using the option or filesystem
long size;
if (maxFileLength == Long.MAX_VALUE) {
- size = fs.getFileStatus(path).getLen();
+ FileStatus fileStatus = fs.getFileStatus(path);
+ size = fileStatus.getLen();
+ modificationTime = fileStatus.getModificationTime();
} else {
size = maxFileLength;
+ modificationTime = -1;
}
+ fileTailBuilder.setFileLength(size);
//read last bytes into buffer to get PostScript
int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
@@ -416,6 +419,8 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
buffer = extraBuf;
buffer.position(0);
buffer.limit(footerSize + metadataSize);
+ readSize += extra;
+ psOffset = readSize - 1 - psLen;
} else {
//footer is already in the bytes in buffer, just adjust position, length
buffer.position(psOffset - footerSize - metadataSize);
@@ -424,6 +429,17 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
// remember position for later
buffer.mark();
+ int bufferSize = (int) ps.getCompressionBlockSize();
+ CompressionCodec codec = WriterImpl.createCodec(CompressionKind.valueOf(ps.getCompression().name()));
+ fileTailBuilder.setPostscriptLength(psLen).setPostscript(ps);
+ int footerOffset = psOffset - footerSize;
+ buffer.position(footerOffset);
+ ByteBuffer footerBuffer = buffer.slice();
+ buffer.reset();
+ OrcProto.Footer footer = extractFooter(footerBuffer, 0, footerSize,
+ codec, bufferSize);
+ fileTailBuilder.setFooter(footer);
+
} finally {
try {
file.close();
@@ -431,101 +447,30 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
LOG.error("Failed to close the file after another error", ex);
}
}
-
- return new FileMetaInfo(
- ps.getCompression().toString(),
- (int) ps.getCompressionBlockSize(),
- (int) ps.getMetadataLength(),
- buffer,
- ps.getVersionList(),
- writerVersion
- );
- }
-
-
-
- /**
- * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl
- * from serialized fields.
- * As the fields are final, the fields need to be initialized in the constructor and
- * can't be done in some helper function. So this helper class is used instead.
- *
- */
- private static class MetaInfoObjExtractor{
- final CompressionKind compressionKind;
- final CompressionCodec codec;
- final int bufferSize;
- final int metadataSize;
- final OrcProto.Metadata metadata;
- final OrcProto.Footer footer;
- final ObjectInspector inspector;
-
- MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize,
- ByteBuffer footerBuffer) throws IOException {
-
- this.compressionKind = CompressionKind.valueOf(codecStr.toUpperCase());
- this.bufferSize = bufferSize;
- this.codec = WriterImpl.createCodec(compressionKind);
- this.metadataSize = metadataSize;
-
- int position = footerBuffer.position();
- int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize;
- footerBuffer.limit(position + metadataSize);
-
- CodedInputStream instream = InStream.createCodedInputStream("metadata",
- Lists.newArrayList(new BufferChunk(footerBuffer, 0)), metadataSize,
- codec, bufferSize);
- this.metadata = OrcProto.Metadata.parseFrom(instream);
-
- footerBuffer.position(position + metadataSize);
- footerBuffer.limit(position + metadataSize + footerBufferSize);
- instream = InStream.createCodedInputStream("footer", Lists.newArrayList(
- new BufferChunk(footerBuffer, 0)), footerBufferSize, codec, bufferSize);
- this.footer = OrcProto.Footer.parseFrom(instream);
-
- footerBuffer.position(position);
- this.inspector = OrcStruct.createObjectInspector(0, footer.getTypesList());
- }
+ return new OrcTail(fileTailBuilder.build(), buffer.slice(), modificationTime);
}
- /**
- * FileMetaInfo - represents file metadata stored in footer and postscript sections of the file
- * that is useful for Reader implementation
- *
- */
- static class FileMetaInfo{
- final String compressionType;
- final int bufferSize;
- final int metadataSize;
- final ByteBuffer footerBuffer;
- final List versionList;
- final OrcFile.WriterVersion writerVersion;
-
- FileMetaInfo(String compressionType, int bufferSize, int metadataSize,
- ByteBuffer footerBuffer, OrcFile.WriterVersion writerVersion) {
- this(compressionType, bufferSize, metadataSize, footerBuffer, null,
- writerVersion);
- }
-
- FileMetaInfo(String compressionType, int bufferSize, int metadataSize,
- ByteBuffer footerBuffer, List versionList,
- OrcFile.WriterVersion writerVersion){
- this.compressionType = compressionType;
- this.bufferSize = bufferSize;
- this.metadataSize = metadataSize;
- this.footerBuffer = footerBuffer;
- this.versionList = versionList;
- this.writerVersion = writerVersion;
- }
+ private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos,
+ int footerSize, CompressionCodec codec,
+ int bufferSize) throws IOException {
+ bb.position(footerAbsPos);
+ bb.limit(footerAbsPos + footerSize);
+ InputStream instream = InStream.create("footer",
+ Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize);
+ CodedInputStream in = CodedInputStream.newInstance(instream);
+ return OrcProto.Footer.parseFrom(in);
}
- public FileMetaInfo getFileMetaInfo(){
- return new FileMetaInfo(compressionKind.toString(), bufferSize,
- metadataSize, footerByteBuffer, versionList, writerVersion);
+ public static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
+ int metadataSize, CompressionCodec codec, int bufferSize) throws IOException {
+ bb.position(metadataAbsPos);
+ bb.limit(metadataAbsPos + metadataSize);
+ InputStream inputStream = InStream.create("metadata",
+ Lists.newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize);
+ CodedInputStream in = CodedInputStream.newInstance(inputStream);
+ return OrcProto.Metadata.parseFrom(in);
}
-
-
@Override
public RecordReader rows() throws IOException {
return rowsOptions(new Options());
@@ -577,24 +522,34 @@ public long getRawDataSize() {
for (int i = 0; i < stats.size(); ++i) {
indices.add(i);
}
- deserializedSize = getRawDataSizeFromColIndices(indices);
+ deserializedSize = getRawDataSizeFromColIndices(indices, footer.getTypesList(), stats);
}
return deserializedSize;
}
@Override
- public long getRawDataSizeFromColIndices(List colIndices) {
+ public OrcProto.FileTail getFileTail() {
+ return tail.getFileTail();
+ }
+
+ @Override
+ public ByteBuffer getSerializedFileFooter() {
+ return tail.getSerializedTail();
+ }
+
+ public static long getRawDataSizeFromColIndices(List<Integer> colIndices, List<OrcProto.Type> types,
+ List<OrcProto.ColumnStatistics> stats) {
long result = 0;
for (int colIdx : colIndices) {
- result += getRawDataSizeOfColumn(colIdx);
+ result += getRawDataSizeOfColumn(colIdx, types, stats);
}
return result;
}
- private long getRawDataSizeOfColumn(int colIdx) {
- OrcProto.ColumnStatistics colStat = footer.getStatistics(colIdx);
+ private static long getRawDataSizeOfColumn(int colIdx, List<OrcProto.Type> types, List<OrcProto.ColumnStatistics> stats) {
+ OrcProto.ColumnStatistics colStat = stats.get(colIdx);
long numVals = colStat.getNumberOfValues();
- Type type = footer.getTypes(colIdx);
+ Type type = types.get(colIdx);
switch (type.getKind()) {
case BINARY:
@@ -638,7 +593,12 @@ private long getRawDataSizeOfColumn(int colIdx) {
@Override
public long getRawDataSizeOfColumns(List colNames) {
List colIndices = getColumnIndicesFromNames(colNames);
- return getRawDataSizeFromColIndices(colIndices);
+ return getRawDataSizeFromColIndices(colIndices, tail.getTypes(), tail.getFooter().getStatisticsList());
+ }
+
+ @Override
+ public long getRawDataSizeFromColIndices(List<Integer> colIds) {
+ return getRawDataSizeFromColIndices(colIds, tail.getTypes(), tail.getFooter().getStatisticsList());
}
private List getColumnIndicesFromNames(List colNames) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java
index 64b426c..330ac12 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java
@@ -168,8 +168,9 @@ public VectorizedOrcInputFormat() {
if(fSplit instanceof OrcSplit){
OrcSplit orcSplit = (OrcSplit) fSplit;
if (orcSplit.hasFooter()) {
- opts.fileMetaInfo(orcSplit.getFileMetaInfo());
+ opts.orcTail(orcSplit.getOrcTail());
}
+ opts.maxLength(orcSplit.getFileLength());
}
Reader reader = OrcFile.createReader(path, opts);
return new VectorizedOrcRecordReader(reader, conf, fSplit);
diff --git a/ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto b/ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto
index 06d0b07..5505efe 100644
--- a/ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto
+++ b/ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto
@@ -220,3 +220,11 @@ message PostScript {
// Leave this last in the record
optional string magic = 8000;
}
+
+// This gets serialized as part of OrcSplit, also used by footer cache.
+message FileTail {
+ optional PostScript postscript = 1;
+ optional Footer footer = 2;
+ optional uint64 fileLength = 3;
+ optional uint64 postscriptLength = 4;
+}
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 3e7565e..31d561b 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -19,6 +19,7 @@
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.DataInput;
@@ -33,9 +34,11 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
+import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TimeZone;
@@ -66,14 +69,12 @@
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
-import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.InputFormatChecker;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy;
-import org.apache.hadoop.hive.ql.io.orc.TestOrcRawRecordMerger.MyRow;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
@@ -112,9 +113,6 @@
import org.junit.Test;
import org.junit.rules.TestName;
-import com.esotericsoftware.kryo.Kryo;
-import com.esotericsoftware.kryo.io.Output;
-
public class TestInputOutputFormat {
Path workDir = new Path(System.getProperty("test.tmp.dir","target/tmp"));
@@ -714,6 +712,17 @@ public MockFile(String path, int blockSize, byte[] content,
}
@Override
+ public int hashCode() {
+ return path.hashCode() + 31 * length;
+ }
+
+ @Override
+ public boolean equals(final Object obj) {
+ if (!(obj instanceof MockFile)) { return false; }
+ return ((MockFile) obj).path.equals(this.path) && ((MockFile) obj).length == this.length;
+ }
+
+ @Override
public String toString() {
StringBuilder buffer = new StringBuilder();
buffer.append("mockFile{path: ");
@@ -824,6 +833,7 @@ public String toString() {
public static class MockFileSystem extends FileSystem {
final List<MockFile> files = new ArrayList<MockFile>();
+ final Map<MockFile, FileStatus> fileStatusMap = new HashMap<>();
Path workingDir = new Path("/");
protected Statistics statistics;
@@ -857,6 +867,19 @@ public URI getUri() {
}
}
+ // increments file modification time
+ public void touch(MockFile file) {
+ if (fileStatusMap.containsKey(file)) {
+ FileStatus fileStatus = fileStatusMap.get(file);
+ FileStatus fileStatusNew = new FileStatus(fileStatus.getLen(), fileStatus.isDirectory(),
+ fileStatus.getReplication(), fileStatus.getBlockSize(),
+ fileStatus.getModificationTime() + 1, fileStatus.getAccessTime(),
+ fileStatus.getPermission(), fileStatus.getOwner(), fileStatus.getGroup(),
+ fileStatus.getPath());
+ fileStatusMap.put(file, fileStatusNew);
+ }
+ }
+
@Override
public FSDataInputStream open(Path path, int i) throws IOException {
statistics.incrementReadOps(1);
@@ -1038,9 +1061,14 @@ private LocatedFileStatus createLocatedDirectory(Path dir) throws IOException {
}
private FileStatus createStatus(MockFile file) {
- return new FileStatus(file.length, false, 1, file.blockSize, 0, 0,
+ if (fileStatusMap.containsKey(file)) {
+ return fileStatusMap.get(file);
+ }
+ FileStatus fileStatus = new FileStatus(file.length, false, 1, file.blockSize, 0, 0,
FsPermission.createImmutable((short) 644), "owen", "group",
file.path);
+ fileStatusMap.put(file, fileStatus);
+ return fileStatus;
}
private FileStatus createDirectory(Path dir) {
@@ -2286,4 +2314,935 @@ public void testSplitGenReadOps() throws Exception {
// revert back to local fs
conf.set("fs.defaultFS", "file:///");
}
+
+ @Test
+ public void testSplitGenReadOpsLocalCache() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ // creates the static cache
+ MockPath mockPath = new MockPath(fs, "mock:///mocktbl");
+ conf.set("hive.orc.cache.stripe.details.size", "-1");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl
+ // call-2: open - mock:/mocktbl/0_0
+ // call-3: open - mock:/mocktbl/0_1
+ assertEquals(3, readOpsDelta);
+
+ // force BI to avoid reading footers
+ conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI");
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ orcInputFormat = new OrcInputFormat();
+ splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl
+ assertEquals(1, readOpsDelta);
+
+ // enable cache and use default strategy
+ conf.set("hive.orc.cache.stripe.details.size", "100");
+ conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "HYBRID");
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ orcInputFormat = new OrcInputFormat();
+ splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl
+ // call-2: open - mock:/mocktbl/0_0
+ // call-3: open - mock:/mocktbl/0_1
+ assertEquals(3, readOpsDelta);
+
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ orcInputFormat = new OrcInputFormat();
+ splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl
+ assertEquals(1, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
+
+ @Test
+ public void testSplitGenReadOpsLocalCacheChangeFileLen() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ // creates the static cache
+ MockPath mockPath = new MockPath(fs, "mock:///mocktbl1");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl1
+ // call-2: open - mock:/mocktbl1/0_0
+ // call-3: open - mock:/mocktbl1/0_1
+ assertEquals(3, readOpsDelta);
+
+ // change file length and look for cache misses
+
+ fs.clear();
+
+ writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 100; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 100; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ orcInputFormat = new OrcInputFormat();
+ splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl1
+ // call-2: open - mock:/mocktbl1/0_0
+ // call-3: open - mock:/mocktbl1/0_1
+ assertEquals(3, readOpsDelta);
+
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ orcInputFormat = new OrcInputFormat();
+ splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl1
+ assertEquals(1, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
+
+ @Test
+ public void testSplitGenReadOpsLocalCacheChangeModificationTime() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ // creates the static cache
+ MockPath mockPath = new MockPath(fs, "mock:///mocktbl2");
+ conf.set("hive.orc.cache.use.soft.references", "true");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl2
+ // call-2: open - mock:/mocktbl2/0_0
+ // call-3: open - mock:/mocktbl2/0_1
+ assertEquals(3, readOpsDelta);
+
+ // change file modification time and look for cache misses
+ FileSystem fs1 = FileSystem.get(conf);
+ MockFile mockFile = ((MockFileSystem) fs1).findFile(new Path(mockPath + "/0_0"));
+ ((MockFileSystem) fs1).touch(mockFile);
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ orcInputFormat = new OrcInputFormat();
+ splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl2
+ // call-2: open - mock:/mocktbl2/0_1
+ assertEquals(2, readOpsDelta);
+
+ // touch the next file
+ fs1 = FileSystem.get(conf);
+ mockFile = ((MockFileSystem) fs1).findFile(new Path(mockPath + "/0_1"));
+ ((MockFileSystem) fs1).touch(mockFile);
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ orcInputFormat = new OrcInputFormat();
+ splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl2
+ // call-2: open - mock:/mocktbl2/0_0
+ assertEquals(2, readOpsDelta);
+
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ orcInputFormat = new OrcInputFormat();
+ splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: listLocatedStatus - mock:/mocktbl2
+ assertEquals(1, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
+
+ @Test
+ public void testNonVectorReaderNoFooterSerialize() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ MockPath mockPath = new MockPath(fs, "mock:///mocktable1");
+ conf.set("hive.orc.splits.include.file.footer", "false");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+
+ for (InputSplit split : splits) {
+ assertTrue("OrcSplit is expected", split instanceof OrcSplit);
+ // ETL strategies will have start=3 (start of first stripe)
+ assertTrue(split.toString().contains("start=3"));
+ assertTrue(split.toString().contains("hasFooter=false"));
+ assertTrue(split.toString().contains("hasBase=true"));
+ assertTrue(split.toString().contains("deltas=0"));
+ if (split instanceof OrcSplit) {
+ assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in" +
+ " orc splits.", ((OrcSplit) split).hasFooter());
+ }
+ orcInputFormat.getRecordReader(split, conf, null);
+ }
+
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: open to read footer - split 1 => mock:/mocktable1/0_0
+ // call-2: open to read data - split 1 => mock:/mocktable1/0_0
+ // call-3: open to read footer - split 2 => mock:/mocktable1/0_1
+ // call-4: open to read data - split 2 => mock:/mocktable1/0_1
+ assertEquals(4, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
+
+ @Test
+ public void testNonVectorReaderFooterSerialize() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ MockPath mockPath = new MockPath(fs, "mock:///mocktable2");
+ conf.set("hive.orc.splits.include.file.footer", "true");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+
+ for (InputSplit split : splits) {
+ assertTrue("OrcSplit is expected", split instanceof OrcSplit);
+ // ETL strategies will have start=3 (start of first stripe)
+ assertTrue(split.toString().contains("start=3"));
+ assertTrue(split.toString().contains("hasFooter=true"));
+ assertTrue(split.toString().contains("hasBase=true"));
+ assertTrue(split.toString().contains("deltas=0"));
+ if (split instanceof OrcSplit) {
+ assertTrue("Footer serialize test for non-vector reader, hasFooter is expected in" +
+ " orc splits.", ((OrcSplit) split).hasFooter());
+ }
+ orcInputFormat.getRecordReader(split, conf, null);
+ }
+
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: open to read data - split 1 => mock:/mocktable2/0_0
+ // call-2: open to read data - split 2 => mock:/mocktable2/0_1
+ assertEquals(2, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
+
+ @Test
+ public void testVectorReaderNoFooterSerialize() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ MockPath mockPath = new MockPath(fs, "mock:///mocktable3");
+ conf.set("hive.orc.splits.include.file.footer", "false");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ JobConf jobConf = createMockExecutionEnvironment(workDir, new Path("mock:///"),
+ "mocktable3", inspector, true, 0);
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+
+ for (InputSplit split : splits) {
+ assertTrue("OrcSplit is expected", split instanceof OrcSplit);
+ // ETL strategies will have start=3 (start of first stripe)
+ assertTrue(split.toString().contains("start=3"));
+ assertTrue(split.toString().contains("hasFooter=false"));
+ assertTrue(split.toString().contains("hasBase=true"));
+ assertTrue(split.toString().contains("deltas=0"));
+ if (split instanceof OrcSplit) {
+ assertFalse("No footer serialize test for vector reader, hasFooter is not expected in" +
+ " orc splits.", ((OrcSplit) split).hasFooter());
+ }
+ orcInputFormat.getRecordReader(split, jobConf, Reporter.NULL);
+ }
+
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: open to read footer - split 1 => mock:/mocktable3/0_0
+ // call-2: open to read data - split 1 => mock:/mocktable3/0_0
+ // call-3: open to read footer - split 2 => mock:/mocktable3/0_1
+ // call-4: open to read data - split 2 => mock:/mocktable3/0_1
+ assertEquals(4, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
+
+ @Test
+ public void testVectorReaderFooterSerialize() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ MockPath mockPath = new MockPath(fs, "mock:///mocktable4");
+ conf.set("hive.orc.splits.include.file.footer", "true");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ JobConf jobConf = createMockExecutionEnvironment(workDir, new Path("mock:///"),
+ "mocktable4", inspector, true, 0);
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+
+ for (InputSplit split : splits) {
+ assertTrue("OrcSplit is expected", split instanceof OrcSplit);
+ // ETL strategies will have start=3 (start of first stripe)
+ assertTrue(split.toString().contains("start=3"));
+ assertTrue(split.toString().contains("hasFooter=true"));
+ assertTrue(split.toString().contains("hasBase=true"));
+ assertTrue(split.toString().contains("deltas=0"));
+ if (split instanceof OrcSplit) {
+ assertTrue("Footer serialize test for vector reader, hasFooter is expected in" +
+ " orc splits.", ((OrcSplit) split).hasFooter());
+ }
+ orcInputFormat.getRecordReader(split, jobConf, Reporter.NULL);
+ }
+
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: open to read data - split 1 => mock:/mocktable4/0_0
+ // call-2: open to read data - split 2 => mock:/mocktable4/0_1
+ assertEquals(2, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
+
+ @Test
+ public void testACIDReaderNoFooterSerialize() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ MockPath mockPath = new MockPath(fs, "mock:///mocktable5");
+ conf.set("hive.transactional.table.scan", "true");
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
+ conf.set("hive.orc.splits.include.file.footer", "false");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+
+ for (InputSplit split : splits) {
+ assertTrue("OrcSplit is expected", split instanceof OrcSplit);
+ // ETL strategies will have start=3 (start of first stripe)
+ assertTrue(split.toString().contains("start=3"));
+ assertTrue(split.toString().contains("hasFooter=false"));
+ assertTrue(split.toString().contains("hasBase=true"));
+ assertTrue(split.toString().contains("deltas=0"));
+ if (split instanceof OrcSplit) {
+ assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in" +
+ " orc splits.", ((OrcSplit) split).hasFooter());
+ }
+ orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
+ }
+
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: open to read footer - split 1 => mock:/mocktable5/0_0
+ // call-2: open to read data - split 1 => mock:/mocktable5/0_0
+ // call-3: open to read footer - split 2 => mock:/mocktable5/0_1
+ // call-4: open to read data - split 2 => mock:/mocktable5/0_1
+ assertEquals(4, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
+
+ @Test
+ public void testACIDReaderFooterSerialize() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ MockPath mockPath = new MockPath(fs, "mock:///mocktable6");
+ conf.set("hive.transactional.table.scan", "true");
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
+ conf.set("hive.orc.splits.include.file.footer", "true");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(2, splits.length);
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+
+ for (InputSplit split : splits) {
+ assertTrue("OrcSplit is expected", split instanceof OrcSplit);
+ // ETL strategies will have start=3 (start of first stripe)
+ assertTrue(split.toString().contains("start=3"));
+ assertTrue(split.toString().contains("hasFooter=true"));
+ assertTrue(split.toString().contains("hasBase=true"));
+ assertTrue(split.toString().contains("deltas=0"));
+ if (split instanceof OrcSplit) {
+ assertTrue("Footer serialize test for ACID reader, hasFooter is expected in" +
+ " orc splits.", ((OrcSplit) split).hasFooter());
+ }
+ orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
+ }
+
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: open to read data - split 1 => mock:/mocktable6/0_0
+ // call-2: open to read data - split 2 => mock:/mocktable6/0_1
+ assertEquals(2, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
+
+ @Test
+ public void testACIDReaderNoFooterSerializeWithDeltas() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ MockPath mockPath = new MockPath(fs, "mock:///mocktable7");
+ conf.set("hive.transactional.table.scan", "true");
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
+ conf.set("hive.orc.splits.include.file.footer", "false");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(new Path(mockPath + "/delta_001_002") + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(1, splits.length);
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+
+ for (InputSplit split : splits) {
+ assertTrue("OrcSplit is expected", split instanceof OrcSplit);
+ // ETL strategies will have start=3 (start of first stripe)
+ assertTrue(split.toString().contains("start=3"));
+ assertTrue(split.toString().contains("hasFooter=false"));
+ assertTrue(split.toString().contains("hasBase=true"));
+ // NOTE: don't be surprised if the deltas value differs across releases:
+ // older releases report deltas=2 because the min and max transaction ids are added
+ // separately to the delta list; newer releases combine them, so deltas=1.
+ assertTrue(split.toString().contains("deltas=1"));
+ if (split instanceof OrcSplit) {
+ assertFalse("No footer serialize test for ACID reader, hasFooter is not expected in" +
+ " orc splits.", ((OrcSplit) split).hasFooter());
+ }
+ orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
+ }
+
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: open to read footer - split 1 => mock:/mocktable7/0_0
+ // call-2: open to read data - split 1 => mock:/mocktable7/0_0
+ // call-3: open side file (flush length) of delta directory
+ // call-4: fs.exists() check for delta_xxx_xxx/bucket_00000 file
+ assertEquals(4, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
+
+ @Test
+ public void testACIDReaderFooterSerializeWithDeltas() throws Exception {
+ MockFileSystem fs = new MockFileSystem(conf);
+ MockPath mockPath = new MockPath(fs, "mock:///mocktable8");
+ conf.set("hive.transactional.table.scan", "true");
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
+ conf.set("hive.orc.splits.include.file.footer", "true");
+ conf.set("mapred.input.dir", mockPath.toString());
+ conf.set("fs.defaultFS", "mock:///");
+ conf.set("fs.mock.impl", MockFileSystem.class.getName());
+ StructObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = (StructObjectInspector)
+ ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer =
+ OrcFile.createWriter(new Path(mockPath + "/0_0"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ writer = OrcFile.createWriter(new Path(new Path(mockPath + "/delta_001_002") + "/0_1"),
+ OrcFile.writerOptions(conf).blockPadding(false)
+ .bufferSize(1024).inspector(inspector));
+ for (int i = 0; i < 10; ++i) {
+ writer.addRow(new MyRow(i, 2 * i));
+ }
+ writer.close();
+
+ OrcInputFormat orcInputFormat = new OrcInputFormat();
+ InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
+ assertEquals(1, splits.length);
+ int readOpsBefore = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsBefore = statistics.getReadOps();
+ }
+ }
+ assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
+
+ for (InputSplit split : splits) {
+ assertTrue("OrcSplit is expected", split instanceof OrcSplit);
+ // ETL strategies will have start=3 (start of first stripe)
+ assertTrue(split.toString().contains("start=3"));
+ assertTrue(split.toString().contains("hasFooter=true"));
+ assertTrue(split.toString().contains("hasBase=true"));
+ // NOTE: don't be surprised if the deltas value differs across releases:
+ // older releases report deltas=2 because the min and max transaction ids are added
+ // separately to the delta list; newer releases combine them, so deltas=1.
+ assertTrue(split.toString().contains("deltas=1"));
+ if (split instanceof OrcSplit) {
+ assertTrue("Footer serialize test for ACID reader, hasFooter is not expected in" +
+ " orc splits.", ((OrcSplit) split).hasFooter());
+ }
+ orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
+ }
+
+ int readOpsDelta = -1;
+ for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
+ if (statistics.getScheme().equalsIgnoreCase("mock")) {
+ readOpsDelta = statistics.getReadOps() - readOpsBefore;
+ }
+ }
+ // call-1: open to read data - split 1 => mock:/mocktable8/0_0
+ // call-2: open side file (flush length) of delta directory
+ // call-3: fs.exists() check for delta_xxx_xxx/bucket_00000 file
+ assertEquals(3, readOpsDelta);
+
+ // revert back to local fs
+ conf.set("fs.defaultFS", "file:///");
+ }
}