diff --git a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
index 651b842f688..61aca56bac8 100644
--- a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
+++ b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
@@ -47,14 +47,13 @@
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.GlobFilter;
 import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.Trash;
 import org.apache.hadoop.fs.permission.FsAction;
 import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
-import org.apache.hadoop.hive.conf.HiveConfUtil;
-import org.apache.hadoop.hive.io.HdfsUtils;
 import org.apache.hadoop.hive.shims.HadoopShims;
 import org.apache.hadoop.hive.shims.ShimLoader;
 import org.apache.hadoop.hive.shims.Utils;
@@ -332,20 +331,48 @@ public static String unescapePathName(String path) {
    */
   public static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
       List<FileStatus> results) throws IOException {
-    listStatusRecursively(fs, fileStatus, HIDDEN_FILES_PATH_FILTER, results);
+    if (isS3a(fs)) {
+      // S3A has an optimized recursive directory listing implementation, however it doesn't support filtering.
+      // Therefore we filter the result set afterwards. This may be suboptimal in the HDFS case (a tree walk), where a filter could have been applied during listing.
+      listS3FilesRecursive(fileStatus, fs, results);
+    } else {
+      generalListStatusRecursively(fs, fileStatus, results);
+    }
   }
 
-  public static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
-      PathFilter filter, List<FileStatus> results) throws IOException {
+  private static void generalListStatusRecursively(FileSystem fs, FileStatus fileStatus, List<FileStatus> results) throws IOException {
     if (fileStatus.isDir()) {
-      for (FileStatus stat : fs.listStatus(fileStatus.getPath(), filter)) {
-        listStatusRecursively(fs, stat, results);
+      for (FileStatus stat : fs.listStatus(fileStatus.getPath(), HIDDEN_FILES_PATH_FILTER)) {
+        generalListStatusRecursively(fs, stat, results);
       }
     } else {
       results.add(fileStatus);
     }
   }
 
+  private static void listS3FilesRecursive(FileStatus base, FileSystem fs, List<FileStatus> results) throws IOException {
+    if (!base.isDirectory()) {
+      results.add(base);
+      return;
+    }
+    RemoteIterator<LocatedFileStatus> remoteIterator = fs.listFiles(base.getPath(), true);
+    while (remoteIterator.hasNext()) {
+      LocatedFileStatus each = remoteIterator.next();
+      Path relativePath = new Path(each.getPath().toString().replace(base.getPath().toString(), ""));
+      if (org.apache.hadoop.hive.metastore.utils.FileUtils.RemoteIteratorWithFilter.HIDDEN_FILES_FULL_PATH_FILTER.accept(relativePath)) {
+        results.add(each);
+      }
+    }
+  }
+
+  public static boolean isS3a(FileSystem fs) {
+    try {
+      return "s3a".equalsIgnoreCase(fs.getScheme());
+    } catch (UnsupportedOperationException ex) {
+      return false;
+    }
+  }
+
   /**
    * Find the parent of path that exists, if path does not exist
    *
diff --git a/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java b/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
index 09343e56166..7641610e9e5 100644
--- a/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
+++ b/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
@@ -19,14 +19,12 @@
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.LinkedList;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -56,24 +54,13 @@
    */
   public static List<FileStatus> getFileStatusRecurse(Path path, int level, FileSystem fs)
       throws IOException {
-    return getFileStatusRecurse(path, level, fs, FileUtils.HIDDEN_FILES_PATH_FILTER, false);
-  }
-
-  public static List<FileStatus> getFileStatusRecurse(
-      Path path, int level, FileSystem fs, PathFilter filter) throws IOException {
-    return getFileStatusRecurse(path, level, fs, filter, false);
-  }
-
-  public static List<FileStatus> getFileStatusRecurse(
-      Path path, int level, FileSystem fs, PathFilter filter, boolean allLevelsBelow)
-      throws IOException {
 
     // if level is <0, the return all files/directories under the specified path
     if (level < 0) {
       List<FileStatus> result = new ArrayList<FileStatus>();
       try {
         FileStatus fileStatus = fs.getFileStatus(path);
-        FileUtils.listStatusRecursively(fs, fileStatus, filter, result);
+        FileUtils.listStatusRecursively(fs, fileStatus, result);
       } catch (IOException e) {
         // globStatus() API returns empty FileStatus[] when the specified path
         // does not exist. But getFileStatus() throw IOException. To mimic the
@@ -90,31 +77,7 @@
       sb.append(Path.SEPARATOR).append("*");
     }
     Path pathPattern = new Path(path, sb.toString());
-    if (!allLevelsBelow) {
-      return Lists.newArrayList(fs.globStatus(pathPattern, filter));
-    }
-    LinkedList<FileStatus> queue = new LinkedList<>();
-    List<FileStatus> results = new ArrayList<FileStatus>();
-    for (FileStatus status : fs.globStatus(pathPattern)) {
-      if (filter.accept(status.getPath())) {
-        results.add(status);
-      }
-      if (status.isDirectory()) {
-        queue.add(status);
-      }
-    }
-    while (!queue.isEmpty()) {
-      FileStatus status = queue.poll();
-      for (FileStatus child : fs.listStatus(status.getPath())) {
-        if (filter.accept(child.getPath())) {
-          results.add(child);
-        }
-        if (child.isDirectory()) {
-          queue.add(child);
-        }
-      }
-    }
-    return results;
+    return Lists.newArrayList(fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER));
   }
 
   public static int getNumBitVectorsForNDVEstimation(Configuration conf) throws Exception {
diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java
index 38e843aeacf..9ce379bb354 100755
--- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java
+++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java
@@ -690,7 +690,7 @@ public static String makePartName(List<String> partCols,
     try {
       Path path = new Path(location);
       FileSystem fileSys = path.getFileSystem(conf);
-      return FileUtils.getFileStatusRecurse(path, -1, fileSys);
+      return FileUtils.getFileStatusRecurse(path, fileSys);
     } catch (IOException ioe) {
       MetaStoreUtils.logAndThrowMetaException(ioe);
     }
@@ -708,7 +708,7 @@ public static String makePartName(List<String> partCols,
     Path tablePath = getDnsPath(new Path(table.getSd().getLocation()));
     try {
       FileSystem fileSys = tablePath.getFileSystem(conf);
-      return FileUtils.getFileStatusRecurse(tablePath, -1, fileSys);
+      return FileUtils.getFileStatusRecurse(tablePath, fileSys);
     } catch (IOException ioe) {
       MetaStoreUtils.logAndThrowMetaException(ioe);
     }
diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java
index bf206fffc26..e9342e64181 100644
--- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java
+++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java
@@ -17,6 +17,14 @@
  */
 package org.apache.hadoop.hive.metastore.utils;
 
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.List;
+import java.util.NoSuchElementException;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.FileStatus;
@@ -33,16 +41,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.common.collect.Lists;
-
-import java.io.IOException;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.Collections;
-import java.util.List;
-import java.util.NoSuchElementException;
-
 public class FileUtils {
   private static final PathFilter SNAPSHOT_DIR_PATH_FILTER = new PathFilter() {
     @Override
@@ -311,41 +309,31 @@ public static String unescapePathName(String path) {
   /**
    * Get all file status from a root path and recursively go deep into certain levels.
    *
-   * @param path
+   * @param base
    *          the root path
-   * @param level
-   *          the depth of directory to explore
    * @param fs
    *          the file system
    * @return array of FileStatus
   * @throws IOException
    */
-  public static List<FileStatus> getFileStatusRecurse(Path path, int level, FileSystem fs)
-      throws IOException {
-
-    // if level is <0, the return all files/directories under the specified path
-    if (level < 0) {
-      List<FileStatus> result = new ArrayList<>();
-      try {
-        FileStatus fileStatus = fs.getFileStatus(path);
-        FileUtils.listStatusRecursively(fs, fileStatus, result);
-      } catch (IOException e) {
-        // globStatus() API returns empty FileStatus[] when the specified path
-        // does not exist. But getFileStatus() throw IOException. To mimic the
-        // similar behavior we will return empty array on exception. For external
-        // tables, the path of the table will not exists during table creation
-        return new ArrayList<>(0);
+  public static List<FileStatus> getFileStatusRecurse(Path base, FileSystem fs) {
+    try {
+      List<FileStatus> results = new ArrayList<>();
+      if (isS3a(fs)) {
+        // S3A has an optimized recursive directory listing implementation, however it doesn't support filtering.
+        // Therefore we filter the result set afterwards. This may be suboptimal in the HDFS case (a tree walk), where a filter could have been applied during listing.
+        listS3FilesRecursive(base, fs, results);
+      } else {
+        listStatusRecursively(fs, fs.getFileStatus(base), results);
       }
-      return result;
-    }
-
-    // construct a path pattern (e.g., /*/*) to find all dynamically generated paths
-    StringBuilder sb = new StringBuilder(path.toUri().getPath());
-    for (int i = 0; i < level; i++) {
-      sb.append(Path.SEPARATOR).append("*");
+      return results;
+    } catch (IOException e) {
+      // globStatus() API returns empty FileStatus[] when the specified path
+      // does not exist. But getFileStatus() throw IOException. To mimic the
+      // similar behavior we will return empty array on exception. For external
+      // tables, the path of the table will not exists during table creation
+      return new ArrayList<>(0);
     }
-    Path pathPattern = new Path(path, sb.toString());
-    return Lists.newArrayList(fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER));
   }
 
  /**
@@ -361,7 +349,7 @@ public static String unescapePathName(String path) {
    * @param results
    *          receives enumeration of all files found
    */
-  public static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
+  private static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
       List<FileStatus> results) throws IOException {
 
     if (fileStatus.isDir()) {
@@ -373,6 +361,25 @@ public static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
     }
   }
 
+  private static void listS3FilesRecursive(Path base, FileSystem fs, List<FileStatus> results) throws IOException {
+    RemoteIterator<LocatedFileStatus> remoteIterator = fs.listFiles(base, true);
+    while (remoteIterator.hasNext()) {
+      LocatedFileStatus each = remoteIterator.next();
+      Path relativePath = new Path(each.getPath().toString().replace(base.toString(), ""));
+      if (RemoteIteratorWithFilter.HIDDEN_FILES_FULL_PATH_FILTER.accept(relativePath)) {
+        results.add(each);
+      }
+    }
+  }
+
+  public static boolean isS3a(FileSystem fs) {
+    try {
+      return "s3a".equalsIgnoreCase(fs.getScheme());
+    } catch (UnsupportedOperationException ex) {
+      return false;
+    }
+  }
+
   public static String makePartName(List<String> partCols, List<String> vals) {
     return makePartName(partCols, vals, null);
   }