Index: hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java =================================================================== --- hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java (revision 1486505) +++ hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java (working copy) @@ -390,16 +390,30 @@ InterruptedException { clearState(); - LOG.info("Loading regionsinfo from the .META. table"); - boolean success = loadMetaEntries(); - if (!success) return -1; - + // get regions according to what is online on each RegionServer + loadDeployedRegions(); + // check whether .META. is deployed and online + if (!recordMetaRegion()) { + // Will remove later if we can fix it + errors.reportError("Fatal error: unable to get .META. region location. Exiting..."); + return -2; + } // Check if .META. is found only once and in the right place if (!checkMetaRegion()) { // Will remove later if we can fix it - errors.reportError("Encountered fatal error. Exiting..."); + String errorMsg = ".META. table is not consistent."; + if (shouldFixAssignments()) { + errorMsg += "HBCK will try fixing it. Rerun once .META. is back to consistent state."; + } else { + errorMsg += "Run HBCK with proper fix options to fix .META. "; + } + errors.reportError(errorMsg + "Exiting..."); return -2; } + // Not going ahead further with the consistency check for tables when META itself is not consistent. + LOG.info("Loading regionsinfo from the .META. table"); + boolean success = loadMetaEntries(); + if (!success) return -1; // Empty cells in .META.? 
reportEmptyMetaCells(); @@ -414,9 +428,6 @@ reportTablesInFlux(); } - // get regions according to what is online on each RegionServer - loadDeployedRegions(); - // load regiondirs and regioninfos from HDFS if (shouldCheckHdfs()) { loadHdfsRegionDirs(); @@ -1335,10 +1346,13 @@ } catch (KeeperException e) { throw new IOException(e); } - MetaEntry m = - new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis()); - HbckInfo hbInfo = new HbckInfo(m); - regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), hbInfo); + MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis()); + HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName()); + if (hbckInfo == null) { + regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m)); + } else { + hbckInfo.metaEntry = m; + } return true; } @@ -2493,45 +2507,41 @@ * @throws KeeperException * @throws InterruptedException */ - boolean checkMetaRegion() - throws IOException, KeeperException, InterruptedException { - List metaRegions = Lists.newArrayList(); + boolean checkMetaRegion() throws IOException, KeeperException, InterruptedException { + List metaRegions = Lists.newArrayList(); for (HbckInfo value : regionInfoMap.values()) { - if (value.metaEntry.isMetaRegion()) { + if (value.metaEntry != null && value.metaEntry.isMetaRegion()) { metaRegions.add(value); } } - // If something is wrong + // There will always be one entry in regionInfoMap corresponding to .META. + // Check the deployed servers. It should be exactly one server. if (metaRegions.size() != 1) { - HRegionLocation rootLocation = connection.locateRegion( - HConstants.ROOT_TABLE_NAME, HConstants.EMPTY_START_ROW); - HbckInfo root = - regionInfoMap.get(rootLocation.getRegionInfo().getEncodedName()); - - // If there is no region holding .META. 
- if (metaRegions.size() == 0) { + errors.reportError(ERROR_CODE.NULL_META_REGION, + "META region or some of its attributes are null."); + return false; + } + HbckInfo metaHbckInfo = metaRegions.get(0); + List servers = metaHbckInfo.deployedOn; + if (servers.size() != 1) { + if (servers.size() == 0) { errors.reportError(ERROR_CODE.NO_META_REGION, ".META. is not found on any region."); if (shouldFixAssignments()) { errors.print("Trying to fix a problem with .META..."); setShouldRerun(); // try to fix it (treat it as unassigned region) - HBaseFsckRepair.fixUnassigned(admin, root.metaEntry); - HBaseFsckRepair.waitUntilAssigned(admin, root.getHdfsHRI()); + HBaseFsckRepair.fixUnassigned(admin, metaHbckInfo.metaEntry); + HBaseFsckRepair.waitUntilAssigned(admin, metaHbckInfo.metaEntry); } - } - // If there are more than one regions pretending to hold the .META. - else if (metaRegions.size() > 1) { - errors.reportError(ERROR_CODE.MULTI_META_REGION, ".META. is found on more than one region."); + } else if (servers.size() > 1) { + errors + .reportError(ERROR_CODE.MULTI_META_REGION, ".META. is found on more than one region."); if (shouldFixAssignments()) { errors.print("Trying to fix a problem with .META..."); setShouldRerun(); // try fix it (treat is a dupe assignment) - List deployedOn = Lists.newArrayList(); - for (HbckInfo mRegion : metaRegions) { - deployedOn.add(mRegion.metaEntry.regionServer); - } - HBaseFsckRepair.fixMultiAssignment(admin, root.metaEntry, deployedOn); + HBaseFsckRepair.fixMultiAssignment(admin, metaHbckInfo.metaEntry, servers); } } // rerun hbck with hopefully fixed META @@ -2546,15 +2556,6 @@ * @throws IOException if an error is encountered */ boolean loadMetaEntries() throws IOException { - - // get a list of all regions from the master. This involves - // scanning the META table - if (!recordMetaRegion()) { - // Will remove later if we can fix it - errors.reportError("Fatal error: unable to get root region location. 
Exiting..."); - return false; - } - MetaScannerVisitor visitor = new MetaScannerVisitorBase() { int countRecord = 1; @@ -2588,9 +2589,12 @@ } PairOfSameType daughters = HRegionInfo.getDaughterRegions(result); MetaEntry m = new MetaEntry(hri, sn, ts, daughters.getFirst(), daughters.getSecond()); - HbckInfo hbInfo = new HbckInfo(m); - HbckInfo previous = regionInfoMap.put(hri.getEncodedName(), hbInfo); - if (previous != null) { + HbckInfo previous = regionInfoMap.get(hri.getEncodedName()); + if (previous == null) { + regionInfoMap.put(hri.getEncodedName(), new HbckInfo(m)); + } else if (previous.metaEntry == null) { + previous.metaEntry = m; + } else { throw new IOException("Two entries in META are same " + previous); } Index: hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java =================================================================== --- hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java (revision 1486505) +++ hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java (working copy) @@ -48,6 +48,7 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.Abortable; import org.apache.hadoop.hbase.ClusterStatus; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HColumnDescriptor; @@ -63,6 +64,7 @@ import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.client.HConnection; +import org.apache.hadoop.hbase.client.HConnectionManager; import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Result; @@ -88,6 +90,9 @@ import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo; import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker; import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil; +import 
org.apache.hadoop.hbase.zookeeper.MetaRegionTracker; +import org.apache.hadoop.hbase.zookeeper.ZKUtil; +import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -232,7 +237,7 @@ HRegionInfo hri) throws IOException, InterruptedException { try { HBaseFsckRepair.closeRegionSilentlyAndWait(admin, sn, hri); - admin.offline(hri.getRegionName()); + if(!hri.isMetaTable()) admin.offline(hri.getRegionName()); } catch (IOException ioe) { LOG.warn("Got exception when attempting to offline region " + Bytes.toString(hri.getRegionName()), ioe); @@ -2003,6 +2008,57 @@ writeLock.release(); // release for clean state } + @Test + public void testMetaOffline() throws Exception { + // check no errors + HBaseFsck hbck = doFsck(conf, false); + assertNoErrors(hbck); + deleteMetaRegion(conf, true, false, false); + hbck = doFsck(conf, false); + // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the .META. + // inconsistency and whether we will be fixing it or not. 
+ assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN }); + hbck = doFsck(conf, true); + assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN }); + hbck = doFsck(conf, true); + assertNoErrors(hbck); + } + + private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs, + boolean regionInfoOnly) throws IOException, InterruptedException { + HConnection connection = HConnectionManager.getConnection(conf); + HRegionLocation metaLocation = connection.locateRegion(HConstants.META_TABLE_NAME, + HConstants.EMPTY_START_ROW); + ServerName hsa = new ServerName(metaLocation.getHostnamePort(), 0L); + HRegionInfo hri = metaLocation.getRegionInfo(); + if (unassign) { + LOG.info("Undeploying meta region " + hri + " from server " + hsa); + undeployRegion(new HBaseAdmin(conf), hsa, hri); + } + + if (regionInfoOnly) { + LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString()); + Path rootDir = FSUtils.getRootDir(conf); + FileSystem fs = rootDir.getFileSystem(conf); + Path p = new Path(rootDir + "/" + HTableDescriptor.META_TABLEDESC.getNameAsString(), + hri.getEncodedName()); + Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE); + fs.delete(hriPath, true); + } + + if (hdfs) { + LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString()); + Path rootDir = FSUtils.getRootDir(conf); + FileSystem fs = rootDir.getFileSystem(conf); + Path p = new Path(rootDir + "/" + HTableDescriptor.META_TABLEDESC.getNameAsString(), + hri.getEncodedName()); + HBaseFsck.debugLsr(conf, p); + boolean success = fs.delete(p, true); + LOG.info("Deleted " + p + " sucessfully? " + success); + HBaseFsck.debugLsr(conf, p); + } + } + @org.junit.Rule public TestName name = new TestName(); }