diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java index a164b12..489997c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java @@ -310,7 +310,7 @@ void findUnknownPartitions(Table table, Set partPaths, // now check the table folder and see if we find anything // that isn't in the metastore Set allPartDirs = new HashSet(); - getAllLeafDirs(tablePath, allPartDirs); + getLastPartitionDirs(tablePath, allPartDirs, table.getPartCols().size()); // don't want the table dir allPartDirs.remove(tablePath); @@ -367,12 +367,14 @@ private String getPartitionName(Path tablePath, Path partitionPath) { * Start directory * @param allDirs * This set will contain the leaf paths at the end. + * @param maxDepth + * Specify how deep the search goes. * @throws IOException * Thrown if we can't get lists from the fs. * @throws HiveException */ - private void getAllLeafDirs(Path basePath, Set allDirs) throws IOException, HiveException { + private void getLastPartitionDirs(Path basePath, Set allDirs, int maxDepth) throws IOException, HiveException { ConcurrentLinkedQueue basePaths = new ConcurrentLinkedQueue<>(); basePaths.add(basePath); // we only use the keySet of ConcurrentHashMap @@ -390,33 +392,47 @@ private void getAllLeafDirs(Path basePath, Set allDirs) throws IOException LOG.debug("Using threaded version of MSCK-GetPaths with number of threads " + ((ThreadPoolExecutor) pool).getPoolSize()); } - getAllLeafDirs(pool, basePaths, dirSet, basePath.getFileSystem(conf)); + getLastPartitionDirs(pool, basePaths, dirSet, basePath.getFileSystem(conf), maxDepth); pool.shutdown(); allDirs.addAll(dirSet.keySet()); } // process the basePaths in parallel and then the next level of basePaths - private void getAllLeafDirs(final ExecutorService pool, final ConcurrentLinkedQueue basePaths, - final Map allDirs, final FileSystem fs) throws IOException, HiveException { + private void getLastPartitionDirs(final ExecutorService pool, final ConcurrentLinkedQueue basePaths, + final Map allDirs, final FileSystem fs, final int maxDepth) throws IOException, HiveException { final ConcurrentLinkedQueue nextLevel = new ConcurrentLinkedQueue<>(); if (null == pool) { for (final Path path : basePaths) { FileStatus[] statuses = fs.listStatus(path, FileUtils.HIDDEN_FILES_PATH_FILTER); - boolean directoryFound = false; + boolean fileFound = false; for (FileStatus status : statuses) { - if (status.isDir()) { - directoryFound = true; + if (status.isDirectory()) { nextLevel.add(status.getPath()); + } else { + fileFound = true; } } - if (!directoryFound) { - // true is just a boolean object place holder because neither the key nor the value can be null. + if (maxDepth != 0) { + // we are in the middle of the search and we find a file + if (fileFound) { + if ("throw".equals(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_MSCK_PATH_VALIDATION))) { + throw new HiveException( + "MSCK finds a file rather than a folder when it searches for " + path.toString()); + } else { + LOG.warn("MSCK finds a file rather than a folder when it searches for " + + path.toString()); + } + } + if (!nextLevel.isEmpty()) { + getLastPartitionDirs(pool, nextLevel, allDirs, fs, maxDepth - 1); + } + } else { + // we are at the final level. + // true is just a boolean object place holder because neither the + // key nor the value can be null. allDirs.put(path, true); } - if (!nextLevel.isEmpty()) { - getAllLeafDirs(pool, nextLevel, allDirs, fs); - } } } else { final List> futures = new LinkedList<>(); @@ -425,16 +441,32 @@ private void getAllLeafDirs(final ExecutorService pool, final ConcurrentLinkedQu @Override public Void call() throws Exception { FileStatus[] statuses = fs.listStatus(path, FileUtils.HIDDEN_FILES_PATH_FILTER); - boolean directoryFound = false; - + boolean fileFound = false; for (FileStatus status : statuses) { - if (status.isDir()) { - directoryFound = true; + if (status.isDirectory()) { nextLevel.add(status.getPath()); + } else { + fileFound = true; } } - - if (!directoryFound) { + if (maxDepth != 0) { + // we are in the middle of the search and we find a file + if (fileFound) { + if ("throw".equals(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_MSCK_PATH_VALIDATION))) { + throw new HiveException( + "MSCK finds a file rather than a folder when it searches for " + path.toString()); + } else { + LOG.warn("MSCK finds a file rather than a folder when it searches for " + + path.toString()); + } + } + if (!nextLevel.isEmpty()) { + getLastPartitionDirs(pool, nextLevel, allDirs, fs, maxDepth - 1); + } + } else { + // we are at the final level. + // true is just a boolean object place holder because neither the + // key nor the value can be null. allDirs.put(path, true); } return null; @@ -450,10 +482,9 @@ public Void call() throws Exception { throw new HiveException(e.getCause()); } } - if (!nextLevel.isEmpty()) { - getAllLeafDirs(pool, nextLevel, allDirs, fs); + if (!nextLevel.isEmpty() && maxDepth != 0) { + getLastPartitionDirs(pool, nextLevel, allDirs, fs, maxDepth - 1); } } } - } diff --git a/ql/src/test/queries/clientnegative/msck_repair_1.q b/ql/src/test/queries/clientnegative/msck_repair_1.q new file mode 100644 index 0000000..28cce2e --- /dev/null +++ b/ql/src/test/queries/clientnegative/msck_repair_1.q @@ -0,0 +1,19 @@ +set hive.msck.repair.batch.size=1; + +DROP TABLE IF EXISTS repairtable; + +CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING); + +MSCK TABLE repairtable; + +dfs ${system:test.dfs.mkdir} ${system:test.warehouse.dir}/repairtable/p1=c/p2=a/p3=b; +dfs -touchz ${system:test.warehouse.dir}/repairtable/p1=c/p2=a/p3=b/datafile; +dfs -touchz ${system:test.warehouse.dir}/repairtable/p1=c/datafile; + +MSCK TABLE default.repairtable; + +MSCK REPAIR TABLE default.repairtable; + +MSCK TABLE repairtable; + +DROP TABLE default.repairtable; diff --git a/ql/src/test/queries/clientpositive/msck_repair_1.q b/ql/src/test/queries/clientpositive/msck_repair_1.q new file mode 100644 index 0000000..ea596cb --- /dev/null +++ b/ql/src/test/queries/clientpositive/msck_repair_1.q @@ -0,0 +1,18 @@ +set hive.msck.repair.batch.size=1; + +DROP TABLE IF EXISTS repairtable; + +CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING); + +MSCK TABLE repairtable; + +dfs ${system:test.dfs.mkdir} ${system:test.warehouse.dir}/repairtable/p1=c/p2=a/p3=b; +dfs -touchz ${system:test.warehouse.dir}/repairtable/p1=c/p2=a/p3=b/datafile; + +MSCK TABLE default.repairtable; + +MSCK REPAIR TABLE default.repairtable; + +MSCK TABLE repairtable; + +DROP TABLE default.repairtable; diff --git a/ql/src/test/queries/clientpositive/msck_repair_2.q b/ql/src/test/queries/clientpositive/msck_repair_2.q new file mode 100644 index 0000000..d833821 --- /dev/null +++ b/ql/src/test/queries/clientpositive/msck_repair_2.q @@ -0,0 +1,20 @@ +set hive.msck.repair.batch.size=1; +set hive.msck.path.validation=skip; + +DROP TABLE IF EXISTS repairtable; + +CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING); + +MSCK TABLE repairtable; + +dfs ${system:test.dfs.mkdir} ${system:test.warehouse.dir}/repairtable/p1=c/p2=a/p3=b; +dfs -touchz ${system:test.warehouse.dir}/repairtable/p1=c/p2=a/p3=b/datafile; +dfs -touchz ${system:test.warehouse.dir}/repairtable/p1=c/datafile; + +MSCK TABLE default.repairtable; + +MSCK REPAIR TABLE default.repairtable; + +MSCK TABLE repairtable; + +DROP TABLE default.repairtable; diff --git a/ql/src/test/results/clientnegative/msck_repair_1.q.out b/ql/src/test/results/clientnegative/msck_repair_1.q.out new file mode 100644 index 0000000..c5f644d --- /dev/null +++ b/ql/src/test/results/clientnegative/msck_repair_1.q.out @@ -0,0 +1,19 @@ +PREHOOK: query: DROP TABLE IF EXISTS repairtable +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS repairtable +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@repairtable +POSTHOOK: query: CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@repairtable +PREHOOK: query: MSCK TABLE repairtable +PREHOOK: type: MSCK +POSTHOOK: query: MSCK TABLE repairtable +POSTHOOK: type: MSCK +PREHOOK: query: MSCK TABLE default.repairtable +PREHOOK: type: MSCK +FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask diff --git a/ql/src/test/results/clientpositive/msck_repair_1.q.out b/ql/src/test/results/clientpositive/msck_repair_1.q.out new file mode 100644 index 0000000..c394f9b --- /dev/null +++ b/ql/src/test/results/clientpositive/msck_repair_1.q.out @@ -0,0 +1,39 @@ +PREHOOK: query: DROP TABLE IF EXISTS repairtable +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS repairtable +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@repairtable +POSTHOOK: query: CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@repairtable +PREHOOK: query: MSCK TABLE repairtable +PREHOOK: type: MSCK +POSTHOOK: query: MSCK TABLE repairtable +POSTHOOK: type: MSCK +PREHOOK: query: MSCK TABLE default.repairtable +PREHOOK: type: MSCK +POSTHOOK: query: MSCK TABLE default.repairtable +POSTHOOK: type: MSCK +Partitions not in metastore: repairtable:p1=c/p2=a +PREHOOK: query: MSCK REPAIR TABLE default.repairtable +PREHOOK: type: MSCK +POSTHOOK: query: MSCK REPAIR TABLE default.repairtable +POSTHOOK: type: MSCK +Partitions not in metastore: repairtable:p1=c/p2=a +Repair: Added partition to metastore default.repairtable:p1=c/p2=a +PREHOOK: query: MSCK TABLE repairtable +PREHOOK: type: MSCK +POSTHOOK: query: MSCK TABLE repairtable +POSTHOOK: type: MSCK +PREHOOK: query: DROP TABLE default.repairtable +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@repairtable +PREHOOK: Output: default@repairtable +POSTHOOK: query: DROP TABLE default.repairtable +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@repairtable +POSTHOOK: Output: default@repairtable diff --git a/ql/src/test/results/clientpositive/msck_repair_2.q.out b/ql/src/test/results/clientpositive/msck_repair_2.q.out new file mode 100644 index 0000000..c394f9b --- /dev/null +++ b/ql/src/test/results/clientpositive/msck_repair_2.q.out @@ -0,0 +1,39 @@ +PREHOOK: query: DROP TABLE IF EXISTS repairtable +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS repairtable +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@repairtable +POSTHOOK: query: CREATE TABLE repairtable(col STRING) PARTITIONED BY (p1 STRING, p2 STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@repairtable +PREHOOK: query: MSCK TABLE repairtable +PREHOOK: type: MSCK +POSTHOOK: query: MSCK TABLE repairtable +POSTHOOK: type: MSCK +PREHOOK: query: MSCK TABLE default.repairtable +PREHOOK: type: MSCK +POSTHOOK: query: MSCK TABLE default.repairtable +POSTHOOK: type: MSCK +Partitions not in metastore: repairtable:p1=c/p2=a +PREHOOK: query: MSCK REPAIR TABLE default.repairtable +PREHOOK: type: MSCK +POSTHOOK: query: MSCK REPAIR TABLE default.repairtable +POSTHOOK: type: MSCK +Partitions not in metastore: repairtable:p1=c/p2=a +Repair: Added partition to metastore default.repairtable:p1=c/p2=a +PREHOOK: query: MSCK TABLE repairtable +PREHOOK: type: MSCK +POSTHOOK: query: MSCK TABLE repairtable +POSTHOOK: type: MSCK +PREHOOK: query: DROP TABLE default.repairtable +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@repairtable +PREHOOK: Output: default@repairtable +POSTHOOK: query: DROP TABLE default.repairtable +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@repairtable +POSTHOOK: Output: default@repairtable