From afc725d3e562216f1213de92e2487bf0ec30af5c Mon Sep 17 00:00:00 2001 From: stack Date: Fri, 12 Jul 2019 16:18:21 -0700 Subject: [PATCH] HBASE-22688 [HBCK2] Add filesystem fixup to hbck2 Adds a general filesystem check command to the HBCK2 list. Runs just the offline filesystem checks from the old hbck1 HBaseFsck tool. Checks hfile validity, if references and hfile links are wholesome, and whether the hbase.version file is present. Pass '--fix' to do fixup. Depends on hbck1 HBaseFsck being present in hbck2 (HBASE-22680 adds it so this could go in after it) Does NOT do regiondir or tabledir fixup or fixing of orphans in hdfs or plugging holes in hdfs. A hbase-hbck2/src/main/java/org/apache/hbase/FileSystemFsck.java Adds a file to handle filesystem fsck checking and fixing. M hbase-hbck2/src/main/java/org/apache/hbase/hbck1/HFileCorruptionChecker.java Formatting fixes and filling out the MOB report so like the hfile report. --- hbase-hbck2/README.md | 5 + .../java/org/apache/hbase/FileSystemFsck.java | 100 ++++++++++++++++++ .../src/main/java/org/apache/hbase/HBCK2.java | 65 ++++++++---- .../org/apache/hbase/hbck1/HBaseFsck.java | 62 ++++++----- .../hbase/hbck1/HFileCorruptionChecker.java | 28 +++-- 5 files changed, 204 insertions(+), 56 deletions(-) create mode 100644 hbase-hbck2/src/main/java/org/apache/hbase/FileSystemFsck.java diff --git a/hbase-hbck2/README.md b/hbase-hbck2/README.md index 95c5e32..3e8dd66 100644 --- a/hbase-hbck2/README.md +++ b/hbase-hbck2/README.md @@ -384,3 +384,8 @@ Check the master logs. The master should have come up. You’ll see successful c The rebuild of _hbase:meta_ adds the user tables in _DISABLED_ state and the regions in _CLOSED_ mode. Reenable tables via the shell to bring all table regions back online. The rebuild meta will likely be missing edits and may need subsequent repair and cleaning using facility outlined higher up in this README. 
+ +### Dropped reference files, missing hbase.version file, and corrupted hfiles + +HBCK2 can check for hanging references and corrupt hfiles. You can ask it to sideline bad files which may be needed to get over humps where regions won't online or reads are failing. See the _filesystem_ command in the HBCK2 listing. Pass one or more table names (or none at all to check all tables). It will report bad files. Pass the _--fix_ option to effect repairs. + diff --git a/hbase-hbck2/src/main/java/org/apache/hbase/FileSystemFsck.java b/hbase-hbck2/src/main/java/org/apache/hbase/FileSystemFsck.java new file mode 100644 index 0000000..b752b0d --- /dev/null +++ b/hbase-hbck2/src/main/java/org/apache/hbase/FileSystemFsck.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hbase; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.master.TableNamespaceManager; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hbase.hbck1.HBaseFsck; +import org.apache.hbase.hbck1.HFileCorruptionChecker; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Collection; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Checks and repairs for hbase filesystem. + */ +public class FileSystemFsck implements Closeable { + private final Configuration configuration; + private FileSystem fs; + private Path rootDir; + + FileSystemFsck(Configuration conf) throws IOException { + this.configuration = conf; + this.rootDir = FSUtils.getRootDir(this.configuration); + this.fs = rootDir.getFileSystem(this.configuration); + + } + + @Override + public void close() { + // Nothing to do. + } + + int fsck(Options hbck2Options, String[] args) throws IOException { + Options options = new Options(); + Option fixOption = Option.builder("f").longOpt("fix").build(); + options.addOption(fixOption); + // Parse command-line. + CommandLineParser parser = new DefaultParser(); + CommandLine commandLine = null; + try { + commandLine = parser.parse(options, args, false); + } catch(ParseException e) { + HBCK2.usage(hbck2Options, e.getMessage()); + } + boolean fix = commandLine.hasOption(fixOption.getOpt()); + // Before we start make sure of the version file. 
+ if (fix && !HBaseFsck.versionFileExists(this.fs, this.rootDir)) { + HBaseFsck.versionFileCreate(this.configuration, this.fs, this.rootDir); + } + // Iterate over list of tablenames or encoded region names passed. + try (HBaseFsck hbaseFsck = new HBaseFsck(this.configuration)) { + // Check hfiles. + HFileCorruptionChecker hfcc = hbaseFsck.createHFileCorruptionChecker(fix); + hbaseFsck.setHFileCorruptionChecker(hfcc); + Collection tables = commandLine.getArgList(); + Collection tableDirs = tables.isEmpty()? + FSUtils.getTableDirs(this.fs, this.rootDir): + tables.stream().map(t -> FSUtils.getTableDir(this.rootDir, TableName.valueOf(t))). + collect(Collectors.toList()); + hfcc.checkTables(tableDirs); + hfcc.report(hbaseFsck.getErrors()); + // Now check links. + hbaseFsck.setFixReferenceFiles(fix); + hbaseFsck.setFixHFileLinks(fix); + hbaseFsck.setShouldRerun(); + hbaseFsck.offlineHbck(); + } catch (ClassNotFoundException | InterruptedException e) { + throw new IOException(e); + } + return 0; + } +} diff --git a/hbase-hbck2/src/main/java/org/apache/hbase/HBCK2.java b/hbase-hbck2/src/main/java/org/apache/hbase/HBCK2.java index eafb6bb..076fb1a 100644 --- a/hbase-hbck2/src/main/java/org/apache/hbase/HBCK2.java +++ b/hbase-hbck2/src/main/java/org/apache/hbase/HBCK2.java @@ -57,6 +57,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.InterruptedIOException; import java.io.PrintWriter; import java.io.StringWriter; import java.util.Arrays; @@ -85,6 +86,7 @@ public class HBCK2 extends Configured implements Tool { private static final String ASSIGNS = "assigns"; private static final String UNASSIGNS = "unassigns"; private static final String BYPASS = "bypass"; + private static final String FILESYSTEM = "filesystem"; private static final String VERSION = "version"; private static final String SET_REGION_STATE = "setRegionState"; private Configuration conf; @@ -296,28 +298,18 @@ public 
class HBCK2 extends Configured implements Tool { writer.println(" you have is a parent pid to finish parent and children. This"); writer.println(" is SLOW, and dangerous so use selectively. Does not always work."); writer.println(); - writer.println(" " + UNASSIGNS + " ..."); + // out.println(" -checkCorruptHFiles Check all Hfiles by opening them to make sure they are valid"); + // out.println(" -sidelineCorruptHFiles Quarantine corrupted HFiles. implies -checkCorruptHFiles"); + // out.println(" -fixVersionFile Try to fix missing hbase.version file in hdfs."); + // out.println(" -fixReferenceFiles Try to offline lingering reference store files"); + // out.println(" -fixHFileLinks Try to offline lingering HFileLinks"); + writer.println(" " + FILESYSTEM + " [OPTIONS] [ "); - writer.println(" Possible table states: " + Arrays.stream(TableState.State.values()). - map(i -> i.toString()).collect(Collectors.joining(", "))); - writer.println(" To read current table state, in the hbase shell run: "); - writer.println(" hbase> get 'hbase:meta', '', 'table:state'"); - writer.println(" A value of \\x08\\x00 == ENABLED, \\x08\\x01 == DISABLED, etc."); - writer.println(" Can also run a 'describe \"\"' at the shell prompt."); - writer.println(" An example making table name 'user' ENABLED:"); - writer.println(" $ HBCK2 setTableState users ENABLED"); - writer.println(" Returns whatever the previous table state was."); + writer.println(" -f, --fix sideline corrupt hfiles, bad links and references."); + writer.println(" Report corrupt hfiles and broken links. Pass '--fix' to sideline"); + writer.println(" corrupt files and links. Pass one or more tablenames to narrow"); + writer.println(" the checkup. Default checks all tables. 
Modified regions will"); + writer.println(" need to be reopened to pickup changes."); writer.println(); writer.println(" " + SET_REGION_STATE + " "); writer.println(" Possible region states:"); @@ -336,6 +328,28 @@ public class HBCK2 extends Configured implements Tool { writer.println(" $ HBCK2 setRegionState de00010733901a05f5a2a3a382e27dd4 CLOSING"); writer.println(" Returns \"0\" if region state changed and \"1\" otherwise."); writer.println(); + writer.println(" " + SET_TABLE_STATE + " "); + writer.println(" Possible table states: " + Arrays.stream(TableState.State.values()). + map(i -> i.toString()).collect(Collectors.joining(", "))); + writer.println(" To read current table state, in the hbase shell run: "); + writer.println(" hbase> get 'hbase:meta', '', 'table:state'"); + writer.println(" A value of \\x08\\x00 == ENABLED, \\x08\\x01 == DISABLED, etc."); + writer.println(" Can also run a 'describe \"\"' at the shell prompt."); + writer.println(" An example making table name 'user' ENABLED:"); + writer.println(" $ HBCK2 setTableState users ENABLED"); + writer.println(" Returns whatever the previous table state was."); + writer.println(); + writer.println(" " + UNASSIGNS + " ..."); + writer.println(" Options:"); + writer.println(" -o,--override override ownership by another procedure"); + writer.println(" A 'raw' unassign that can be used even during Master initialization"); + writer.println(" (if the -skip flag is specified). Skirts Coprocessors. Pass one or"); + writer.println(" more encoded region names. 
1588230740 is the hard-coded name for"); + writer.println(" the hbase:meta region and de00010733901a05f5a2a3a382e27dd4 is an"); + writer.println(" example of what a userspace encoded region name looks like."); + writer.println(" For example:"); + writer.println(" $ HBCK2 unassign 1588230740 de00010733901a05f5a2a3a382e27dd4"); + writer.println(" Returns the pid(s) of the created UnassignProcedure(s) or -1 if none."); writer.close(); return sw.toString(); } @@ -487,6 +501,15 @@ public class HBCK2 extends Configured implements Tool { return EXIT_FAILURE; } return setRegionState(commands[1], RegionState.State.valueOf(commands[2])); + + case FILESYSTEM: + try (FileSystemFsck fsfsck = new FileSystemFsck(getConf())) { + if (fsfsck.fsck(options, purgeFirst(commands)) != 0) { + return EXIT_FAILURE; + } + } + break; + default: usage(options, "Unsupported command: " + command); return EXIT_FAILURE; diff --git a/hbase-hbck2/src/main/java/org/apache/hbase/hbck1/HBaseFsck.java b/hbase-hbck2/src/main/java/org/apache/hbase/hbck1/HBaseFsck.java index a3ba1b7..e036262 100644 --- a/hbase-hbck2/src/main/java/org/apache/hbase/hbck1/HBaseFsck.java +++ b/hbase-hbck2/src/main/java/org/apache/hbase/hbck1/HBaseFsck.java @@ -677,7 +677,7 @@ public class HBaseFsck extends Configured implements Closeable { /** * Clear the current state of hbck. 
*/ - private void clearState() { + private void clearState() { // Make sure regionInfo is empty before starting fixes = 0; regionInfoMap.clear(); @@ -841,6 +841,13 @@ public class HBaseFsck extends Configured implements Closeable { } } + public void offlineHbck() throws IOException, InterruptedException { + // Do offline check and repair first + offlineHdfsIntegrityRepair(); + offlineReferenceFileRepair(); + offlineHLinkFileRepair(); + } + /** * Contacts the master and prints out cluster-wide information * @return 0 on success, non-zero on failure @@ -853,9 +860,7 @@ public class HBaseFsck extends Configured implements Closeable { // Clean start clearState(); // Do offline check and repair first - offlineHdfsIntegrityRepair(); - offlineReferenceFileRepair(); - offlineHLinkFileRepair(); + offlineHbck(); // If Master runs maintenance tasks (such as balancer, catalog janitor, etc) during online // hbck, it is likely that hbck would be misled and report transient errors. Therefore, it // is better to set Master into maintenance mode during online hbck. @@ -1179,8 +1184,7 @@ public class HBaseFsck extends Configured implements Closeable { FileSystem fs = hbaseRoot.getFileSystem(conf); LOG.info("Computing mapping of all store files"); Map allFiles = FSUtils.getTableStoreFilePathMap(fs, hbaseRoot, - new FSUtils.ReferenceFileFilter(fs), executor, null/* Used to report progress only! errors*/); - errors.print(""); + new FSUtils.ReferenceFileFilter(fs), executor, null/*Used to emit 'progress' and thats it*/); LOG.info("Validating mapping using HDFS state"); for (Path path: allFiles.values()) { Path referredToFile = StoreFileInfo.getReferredToFile(path); @@ -1233,9 +1237,7 @@ public class HBaseFsck extends Configured implements Closeable { LOG.info("Computing mapping of all link files"); Map allFiles = FSUtils .getTableStoreFilePathMap(fs, hbaseRoot, new FSUtils.HFileLinkFilter(), executor, - null /* Used to report progress only! 
errors*/); - errors.print(""); - + null/*Used to emit 'progress' w/o context.*/); LOG.info("Validating mapping using HDFS state"); for (Path path : allFiles.values()) { // building HFileLink object to gather locations @@ -1851,7 +1853,7 @@ public class HBaseFsck extends Configured implements Closeable { fs.mkdirs(dst); LOG.info("Sidelining files from " + src + " into containing region " + dst); - // FileSystem.rename is inconsistent with directories -- if the + // FileSystemFsck.rename is inconsistent with directories -- if the // dst (foo/a) exists and is a dir, and the src (foo/b) is a dir, // it moves the src into the dst dir resulting in (foo/a/b). If // the dst does not exist, and the src a dir, src becomes dst. (foo/b) @@ -1946,6 +1948,25 @@ public class HBaseFsck extends Configured implements Closeable { .inStates(TableState.State.DISABLED, TableState.State.DISABLING); } + /** + * @return True if the hbase version file exists. + */ + // TODO: Add an hbase.version file integrity check. + public static boolean versionFileExists(FileSystem fs, Path rootDir) throws IOException { + return fs.exists(new Path(rootDir, HConstants.VERSION_FILE_NAME)); + } + + /** + * Create hbase.version file. 
+ */ + public static void versionFileCreate(Configuration configuration, FileSystem fs, Path rootDir) + throws IOException { + FSUtils.setVersion(fs, rootDir, + configuration.getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000), + configuration.getInt(HConstants.VERSION_FILE_WRITE_ATTEMPTS, + HConstants.DEFAULT_VERSION_FILE_WRITE_ATTEMPTS)); + } + /** * Scan HDFS for all regions, recording their information into * regionInfoMap @@ -1958,8 +1979,6 @@ public class HBaseFsck extends Configured implements Closeable { // List all tables from HDFS List tableDirs = Lists.newArrayList(); - boolean foundVersionFile = fs.exists(new Path(rootDir, HConstants.VERSION_FILE_NAME)); - List paths = FSUtils.getTableDirs(fs, rootDir); for (Path path : paths) { TableName tableName = FSUtils.getTableName(path); @@ -1971,17 +1990,12 @@ public class HBaseFsck extends Configured implements Closeable { } // Verify that version file exists - if (!foundVersionFile) { + if (!versionFileExists(fs, rootDir)) { errors.reportError(ErrorReporter.ERROR_CODE.NO_VERSION_FILE, - "Version file does not exist in root dir " + rootDir); + "Version file does not exist under " + rootDir); if (shouldFixVersionFile()) { - LOG.info("Trying to create a new " + HConstants.VERSION_FILE_NAME - + " file."); setShouldRerun(); - FSUtils.setVersion(fs, rootDir, getConf().getInt( - HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000), getConf().getInt( - HConstants.VERSION_FILE_WRITE_ATTEMPTS, - HConstants.DEFAULT_VERSION_FILE_WRITE_ATTEMPTS)); + versionFileCreate(getConf(), fs, rootDir); } } @@ -2854,7 +2868,7 @@ public class HBaseFsck extends Configured implements Closeable { } LOG.info("[" + thread + "] Moving files from " + src + " into containing region " + dst); - // FileSystem.rename is inconsistent with directories -- if the + // FileSystemFsck.rename is inconsistent with directories -- if the // dst (foo/a) exists and is a dir, and the src (foo/b) is a dir, // it moves the src into the dst dir resulting in (foo/a/b). 
If // the dst does not exist, and the src a dir, src becomes dst. (foo/b) @@ -4678,7 +4692,7 @@ public class HBaseFsck extends Configured implements Closeable { * Display the full report from fsck. This displays all live and dead * region servers, and all known regions. */ - void setShouldRerun() { + public void setShouldRerun() { rerun = true; } @@ -4721,7 +4735,7 @@ public class HBaseFsck extends Configured implements Closeable { checkHdfs = checking; } - boolean shouldCheckHdfs() { + public boolean shouldCheckHdfs() { return checkHdfs; } @@ -4874,7 +4888,7 @@ public class HBaseFsck extends Configured implements Closeable { this.sidelineDir = new Path(sidelineDir); } - protected HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException { + public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException { return new HFileCorruptionChecker(getConf(), executor, sidelineCorruptHFiles); } diff --git a/hbase-hbck2/src/main/java/org/apache/hbase/hbck1/HFileCorruptionChecker.java b/hbase-hbck2/src/main/java/org/apache/hbase/hbck1/HFileCorruptionChecker.java index 983bd75..d473dfd 100644 --- a/hbase-hbck2/src/main/java/org/apache/hbase/hbck1/HFileCorruptionChecker.java +++ b/hbase-hbck2/src/main/java/org/apache/hbase/hbck1/HFileCorruptionChecker.java @@ -525,14 +525,14 @@ public class HFileCorruptionChecker { * @param out */ public void report(HBaseFsck.ErrorReporter out) { - out.print("Checked " + hfilesChecked.get() + " hfile for corruption"); - out.print(" HFiles corrupted: " + corrupted.size()); + out.print("Checked " + hfilesChecked.get() + " hfiles for corruption"); + out.print(" Corrupt HFiles: " + corrupted.size()); if (inQuarantineMode) { - out.print(" HFiles successfully quarantined: " + quarantined.size()); + out.print(" Successfully Quarantined HFiles: " + quarantined.size()); for (Path sq : quarantined) { out.print(" " + sq); } - out.print(" HFiles failed quarantine: " + 
failures.size()); + out.print(" Failed Quarantine HFiles: " + failures.size()); for (Path fq : failures) { out.print(" " + fq); } @@ -546,18 +546,26 @@ public class HFileCorruptionChecker { String fixedState = (corrupted.size() == quarantined.size()) ? "OK" : "CORRUPTED"; + if (inQuarantineMode) { + out.print("Summary: " + initialState + " => " + fixedState); + } else { + out.print("Summary: " + initialState); + } + // print mob-related report + out.print("Checked " + mobFilesChecked.get() + " MOB files for corruption"); + out.print(" Corrupt MOB files: " + corruptedMobFiles.size()); if (inQuarantineMode) { - out.print(" Mob files successfully quarantined: " + quarantinedMobFiles.size()); + out.print(" Successfully Quarantined MOB files: " + quarantinedMobFiles.size()); for (Path sq : quarantinedMobFiles) { out.print(" " + sq); } - out.print(" Mob files failed quarantine: " + failureMobFiles.size()); + out.print(" Failed Quarantine MOB files: " + failureMobFiles.size()); for (Path fq : failureMobFiles) { out.print(" " + fq); } } - out.print(" Mob files moved while checking: " + missedMobFiles.size()); + out.print(" MOB files moved while checking: " + missedMobFiles.size()); for (Path mq : missedMobFiles) { out.print(" " + mq); } @@ -566,11 +574,9 @@ public class HFileCorruptionChecker { : "CORRUPTED"; if (inQuarantineMode) { - out.print("Summary: " + initialState + " => " + fixedState); - out.print("Mob summary: " + initialMobState + " => " + fixedMobState); + out.print("MOB summary: " + initialMobState + " => " + fixedMobState); } else { - out.print("Summary: " + initialState); - out.print("Mob summary: " + initialMobState); + out.print("MOB summary: " + initialMobState); } } } -- 2.19.1