diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 0dea0996c9..eeeb0d7d67 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1984,6 +1984,9 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal " ETL strategy is used when spending little more time in split generation is acceptable" + " (split generation reads and caches file footers). HYBRID chooses between the above strategies" + " based on heuristics."), + HIVE_ORC_BLOB_STORAGE_SPLIT_SIZE("hive.exec.orc.blob.storage.split.size", 128L * 1024 * 1024, + "When blob storage is used, BI split strategy does not have block locations for splitting orc files.\n" + + "In such cases, split generation will use this config to split orc files."), HIVE_ORC_WRITER_LLAP_MEMORY_MANAGER_ENABLED("hive.exec.orc.writer.llap.memory.manager.enabled", true, "Whether orc writers should use llap-aware memory manager. LLAP aware memory manager will use memory\n" + "per executor instead of entire heap memory when concurrent orc writers are involved. 
This will let\n" + diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java index d0d9759849..4dc04f46fd 100644 --- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java +++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java @@ -133,6 +133,10 @@ public URI getUri() { return NAME; } + @Override + public String getScheme() { + return "raw"; + } @Override public FileStatus getFileStatus(Path path) throws IOException { diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java index 63690f9a24..afda7d5c71 100644 --- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java +++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java @@ -89,6 +89,11 @@ public URI getUri() { return NAME; } + @Override + public String getScheme() { + return "raw"; + } + @Override public FileStatus getFileStatus(Path path) throws IOException { File file = pathToFile(path); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index ca254492a1..9dac185067 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.io.orc; +import org.apache.hadoop.hive.common.BlobStorageUtils; import org.apache.hadoop.hive.common.NoDynamicValuesException; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.hdfs.DistributedFileSystem; @@ -1074,16 +1075,28 @@ public BISplitStrategy(Context context, FileSystem fs, Path dir, if (fileKey == null && allowSyntheticFileIds) { 
fileKey = new SyntheticFileId(fileStatus); } - TreeMap blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus); - for (Map.Entry entry : blockOffsets.entrySet()) { - if(entry.getKey() + entry.getValue().getLength() > logicalLen) { - //don't create splits for anything past logical EOF - continue; + if (BlobStorageUtils.isBlobStorageFileSystem(conf, fs)) { + final long splitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVE_ORC_BLOB_STORAGE_SPLIT_SIZE); + LOG.info("Blob storage detected for BI split strategy. Splitting files at boundary {}..", splitSize); + long start; + for (start = 0; start < logicalLen; start = start + splitSize) { + OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, start, + Math.min(splitSize, logicalLen - start), null, null, isOriginal, true, + deltas, -1, logicalLen, dir, offsetAndBucket); + splits.add(orcSplit); } - OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(), + } else { + TreeMap blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus); + for (Map.Entry entry : blockOffsets.entrySet()) { + if (entry.getKey() + entry.getValue().getLength() > logicalLen) { + //don't create splits for anything past logical EOF + continue; + } + OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(), entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true, deltas, -1, logicalLen, dir, offsetAndBucket); - splits.add(orcSplit); + splits.add(orcSplit); + } } } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java index 25cd65737b..9a8ae3b0a0 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java @@ -1237,6 +1237,11 @@ public URI getUri() { } } + @Override + public String getScheme() { + return "mock"; + } + // increments file modification time 
public void touch(MockFile file) { if (fileStatusMap.containsKey(file)) {