diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 89171ef..d6992cd 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -3855,6 +3855,9 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "modification time, which is almost certain to identify file uniquely. However, if you\n" + "use a FS without file IDs and rewrite files a lot (or are paranoid), you might want\n" + "to avoid this setting."), + LLAP_CACHE_FORCE_SYNTHETIC_FILEID("hive.llap.cache.force.synthetic.fileid", false, + "Ignore HDFS file-ids entirely and rely on synthetic file-ids. This is intended for systems\n" + + "which are HDFS protocol based, but allow POSIX mode operations on the same files over NFS."), LLAP_CACHE_DEFAULT_FS_FILE_ID("hive.llap.cache.defaultfs.only.native.fileid", true, "Whether LLAP cache should use native file IDs from the default FS only. 
This is to\n" + "avoid file ID collisions when several different DFS instances are in use at the same\n" + diff --git llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java index b76b0de..ee39179 100644 --- llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java +++ llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java @@ -213,7 +213,9 @@ public OrcEncodedDataReader(LowLevelCache lowLevelCache, BufferUsageManager buff fs = split.getPath().getFileSystem(jobConf); fileKey = determineFileId(fs, split, HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID), - HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID)); + HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID), + HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_CACHE_FORCE_SYNTHETIC_FILEID) + ); fileMetadata = getFileFooterFromCacheOrDisk(); final TypeDescription fileSchema = fileMetadata.getSchema(); @@ -462,7 +464,7 @@ private boolean processStop() { } private static Object determineFileId(FileSystem fs, FileSplit split, - boolean allowSynthetic, boolean checkDefaultFs) throws IOException { + boolean allowSynthetic, boolean checkDefaultFs, boolean forceSynthetic) throws IOException { if (split instanceof OrcSplit) { Object fileKey = ((OrcSplit)split).getFileKey(); if (fileKey != null) { @@ -470,7 +472,7 @@ private static Object determineFileId(FileSystem fs, FileSplit split, } } LOG.warn("Split for " + split.getPath() + " (" + split.getClass() + ") does not have file ID"); - return HdfsUtils.getFileId(fs, split.getPath(), allowSynthetic, checkDefaultFs); + return HdfsUtils.getFileId(fs, split.getPath(), allowSynthetic, checkDefaultFs, forceSynthetic); } /** diff --git llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java 
llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java index 5b54af5..0f008a2 100644 --- llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java +++ llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java @@ -214,7 +214,8 @@ public MemoryBuffer create() { fs = split.getPath().getFileSystem(daemonConf); fileKey = determineFileId(fs, split, HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID), - HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID)); + HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID), + HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_CACHE_FORCE_SYNTHETIC_FILEID)); cacheTag = HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_TRACK_CACHE_USAGE) ? LlapUtil.getDbAndTableNameForMetrics(split.getPath(), true) : null; this.sourceInputFormat = sourceInputFormat; @@ -1680,12 +1681,12 @@ private boolean processStop() { } private static Object determineFileId(FileSystem fs, FileSplit split, - boolean allowSynthetic, boolean checkDefaultFs) throws IOException { + boolean allowSynthetic, boolean checkDefaultFs, boolean forceSynthetic) throws IOException { /* TODO: support this optionally? this is not OrcSplit, but we could add a custom split. 
Object fileKey = ((OrcSplit)split).getFileKey(); if (fileKey != null) return fileKey; */ LlapIoImpl.LOG.warn("Split for " + split.getPath() + " (" + split.getClass() + ") does not have file ID"); - return HdfsUtils.getFileId(fs, split.getPath(), allowSynthetic, checkDefaultFs); + return HdfsUtils.getFileId(fs, split.getPath(), allowSynthetic, checkDefaultFs, forceSynthetic); } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HdfsUtils.java ql/src/java/org/apache/hadoop/hive/ql/io/HdfsUtils.java index 1158b52..3482cfc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/HdfsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/HdfsUtils.java @@ -45,8 +45,8 @@ private static final Logger LOG = LoggerFactory.getLogger(HdfsUtils.class); public static Object getFileId(FileSystem fileSystem, Path path, - boolean allowSynthetic, boolean checkDefaultFs) throws IOException { - if (fileSystem instanceof DistributedFileSystem) { + boolean allowSynthetic, boolean checkDefaultFs, boolean forceSyntheticIds) throws IOException { + if (forceSyntheticIds == false && fileSystem instanceof DistributedFileSystem) { DistributedFileSystem dfs = (DistributedFileSystem) fileSystem; if ((!checkDefaultFs) || isDefaultFs(dfs)) { Object result = SHIMS.getFileId(dfs, path.toUri().getPath()); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index 73c2dcc..b02290e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -2267,8 +2267,11 @@ private static boolean isStripeSatisfyPredicate( boolean checkDefaultFs = HiveConf.getBoolVar( context.conf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID); - boolean isDefaultFs = (!checkDefaultFs) || ((fs instanceof DistributedFileSystem) - && HdfsUtils.isDefaultFs((DistributedFileSystem) fs)); + boolean forceSynthetic = + HiveConf.getBoolVar(context.conf, 
ConfVars.LLAP_CACHE_FORCE_SYNTHETIC_FILEID); + // if forceSynthetic == true, then assume it is not a defaultFS + boolean isDefaultFs = (forceSynthetic == false) && ((!checkDefaultFs) || ((fs instanceof DistributedFileSystem) + && HdfsUtils.isDefaultFs((DistributedFileSystem) fs))); if (baseFiles.isEmpty()) { assert false : "acid 2.0 no base?!: " + dir; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java index f64efe2..a65afda 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java @@ -193,7 +193,8 @@ public void initialize( if (metadataCache != null) { cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file, HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID), - HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID)); + HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID), + HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_FORCE_SYNTHETIC_FILEID)); } if (cacheKey != null) { if (HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_TRACK_CACHE_USAGE)) {