hbck: detect and optionally repair orphaned table-state ZNodes.

Summary (derived from the hunks below):
  * ZKTable: when the local cache reports a table as ENABLING/ENABLED, the state is
    re-read from ZooKeeper before refusing to move the table to ENABLING.
  * HBaseFsck: new check checkAndFixOrphanedTableZNodes() reports
    ORPHANED_ZK_TABLE_ENTRY for every table returned by getEnablingTables() that has
    no entry in tablesInfo. With -fixOrphanedTableZnodes (also enabled by -repair)
    the table state is set to disabled.
  * HBaseFsck: checkAndFixTableLocks() hands the ZooKeeperWatcher it created to
    TableLockChecker and closes that same watcher in a finally block.
  * Tests: TestHBaseFsck#testOrphanedTableZNode; HbckTestingUtil.doFsck() takes an
    additional fixTableZnodes flag.

diff --git hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKTable.java hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKTable.java
index 0676cfe..1581439 100644
--- hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKTable.java
+++ hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKTable.java
@@ -148,7 +148,20 @@ public class ZKTable {
     throws KeeperException {
     synchronized (this.cache) {
       if (isEnablingOrEnabledTable(tableName)) {
-        return false;
+        // If the table is in one of the states from the states list, the cache
+        // might be out-of-date, try to find it out from the master source (zookeeper server).
+        //
+        // Note: this adds extra zookeeper server calls and might have performance impact.
+        // However, this is not the happy path so we should not reach here often. Therefore,
+        // the performance impact should be minimal to none.
+        ZooKeeperProtos.Table.State currentState =
+            ZKTableReadOnly.getTableState(this.watcher, tableName);
+
+        if (currentState == null ||
+            currentState == ZooKeeperProtos.Table.State.ENABLING ||
+            currentState == ZooKeeperProtos.Table.State.ENABLED) {
+          return false;
+        }
       }
       setTableState(tableName, ZooKeeperProtos.Table.State.ENABLING);
       return true;
diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
index dd2ac41..4d21043 100644
--- hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
@@ -95,6 +95,7 @@ import org.apache.hadoop.hbase.master.MasterFileSystem;
 import org.apache.hadoop.hbase.master.RegionState;
 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService.BlockingInterface;
+import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
 import org.apache.hadoop.hbase.regionserver.StoreFileInfo;
@@ -107,6 +108,7 @@ import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandler;
 import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandlerImpl;
 import org.apache.hadoop.hbase.util.hbck.TableLockChecker;
 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
+import org.apache.hadoop.hbase.zookeeper.ZKTable;
 import org.apache.hadoop.hbase.zookeeper.ZKTableReadOnly;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.hadoop.hbase.security.AccessDeniedException;
@@ -222,6 +224,7 @@ public class HBaseFsck extends Configured {
   private boolean fixReferenceFiles = false; // fix lingering reference store file
   private boolean fixEmptyMetaCells = false; // fix (remove) empty REGIONINFO_QUALIFIER rows
   private boolean fixTableLocks = false; // fix table locks which are expired
+  private boolean fixTableZNodes = false; // fix table Znodes which are orphaned
   private boolean fixAny = false; // Set to true if any of the fix is required.
 
   // limit checking/fixes to listed tables, if empty attempt to check/fix all
@@ -277,6 +280,11 @@ public class HBaseFsck extends Configured {
       new HashMap<TableName, Set<String>>();
 
   /**
+   * List of orphaned table ZNodes
+   */
+  private Set<TableName> orphanedTableZNodes = new HashSet<TableName>();
+
+  /**
    * Constructor
    *
    * @param conf Configuration object
@@ -584,6 +592,9 @@ public class HBaseFsck extends Configured {
 
     checkAndFixTableLocks();
 
+    // Check (and fix if requested) orphaned table ZNodes
+    checkAndFixOrphanedTableZNodes();
+
     // Remove the hbck lock
     unlockHbck();
 
@@ -2876,15 +2887,64 @@ public class HBaseFsck extends Configured {
   }
 
   private void checkAndFixTableLocks() throws IOException {
-    TableLockChecker checker = new TableLockChecker(createZooKeeperWatcher(), errors);
-    checker.checkTableLocks();
+    ZooKeeperWatcher zkw = createZooKeeperWatcher();
 
-    if (this.fixTableLocks) {
-      checker.fixExpiredTableLocks();
+    try {
+      TableLockChecker checker = new TableLockChecker(zkw, errors);
+      checker.checkTableLocks();
+
+      if (this.fixTableLocks) {
+        checker.fixExpiredTableLocks();
+      }
+    } finally {
+      zkw.close();
     }
   }
 
   /**
+   * Check whether an orphaned table ZNode exists and fix it if requested.
+   * @throws IOException
+   * @throws KeeperException
+   * @throws InterruptedException
+   */
+  private void checkAndFixOrphanedTableZNodes()
+      throws IOException, KeeperException, InterruptedException {
+    ZooKeeperWatcher zkw = createZooKeeperWatcher();
+    try {
+      ZKTable zkTable = new ZKTable(zkw);
+      Set<TableName> enablingTables = zkTable.getEnablingTables(zkw);
+      String msg;
+      TableInfo tableInfo;
+
+      for (TableName tableName : enablingTables) {
+        // Check whether the table exists in hbase
+        tableInfo = tablesInfo.get(tableName);
+        if (tableInfo != null) {
+          // Table exists. This table state is in transit. No problem for this table.
+          continue;
+        }
+
+        msg = "Table " + tableName + " not found in hbase:meta. Orphaned table ZNode found.";
+        LOG.warn(msg);
+        orphanedTableZNodes.add(tableName);
+        errors.reportError(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY, msg);
+      }
+
+      if (orphanedTableZNodes.size() > 0 && this.fixTableZNodes) {
+        for (TableName tableName : orphanedTableZNodes) {
+          // Set the table state to be disabled so that if we made a mistake, we can trace
+          // the history and figure it out.
+          // Another choice is to call checkAndRemoveTableState() to delete the orphaned ZNode.
+          // Both approaches work.
+          zkTable.setDisabledTable(tableName);
+        }
+      }
+    } finally {
+      zkw.close();
+    }
+  }
+
+  /**
    * Check values in regionInfo for hbase:meta
    * Check if zero or more than one regions with hbase:meta are found.
    * If there are inconsistencies (i.e. zero or more than one regions
@@ -3327,7 +3387,7 @@ public class HBaseFsck extends Configured {
       FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS,
       HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION,
       ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, LINGERING_REFERENCE_HFILE,
-      WRONG_USAGE, EMPTY_META_CELL, EXPIRED_TABLE_LOCK, BOUNDARIES_ERROR
+      WRONG_USAGE, EMPTY_META_CELL, EXPIRED_TABLE_LOCK, ORPHANED_ZK_TABLE_ENTRY, BOUNDARIES_ERROR
     }
     void clear();
     void report(String message);
@@ -3695,6 +3755,15 @@ public class HBaseFsck extends Configured {
   }
 
   /**
+   * Set orphaned table ZNodes fix mode.
+   * Set the table state to disable in the orphaned table ZNode.
+   */
+  public void setFixTableZNodes(boolean shouldFix) {
+    fixTableZNodes = shouldFix;
+    fixAny |= shouldFix;
+  }
+
+  /**
    * Check if we should rerun fsck again. This checks if we've tried to
    * fix something and we should rerun fsck tool again.
    * Display the full report from fsck. This displays all live and dead
@@ -3944,13 +4013,18 @@ public class HBaseFsck extends Configured {
     out.println("");
     out.println(" Metadata Repair shortcuts");
     out.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " +
-        "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps -fixReferenceFiles -fixTableLocks");
+        "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps " +
+        "-fixReferenceFiles -fixTableLocks -fixOrphanedTableZnodes");
     out.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles");
 
     out.println("");
     out.println(" Table lock options");
     out.println(" -fixTableLocks Deletes table locks held for a long time (hbase.table.lock.expire.ms, 10min by default)");
 
+    out.println("");
+    out.println(" Table Znode options");
+    out.println(" -fixOrphanedTableZnodes Set table state in ZNode to disabled if table does not exists");
+
     out.flush();
     errors.reportError(ERROR_CODE.WRONG_USAGE, sw.toString());
 
@@ -4083,6 +4157,7 @@ public class HBaseFsck extends Configured {
         setCheckHdfs(true);
         setFixReferenceFiles(true);
         setFixTableLocks(true);
+        setFixTableZNodes(true);
       } else if (cmd.equals("-repairHoles")) {
         // this will make all missing hdfs regions available but may lose data
         setFixHdfsHoles(true);
@@ -4131,6 +4206,8 @@ public class HBaseFsck extends Configured {
         setRegionBoundariesCheck();
       } else if (cmd.equals("-fixTableLocks")) {
         setFixTableLocks(true);
+      } else if (cmd.equals("-fixOrphanedTableZnodes")) {
+        setFixTableZNodes(true);
       } else if (cmd.startsWith("-")) {
         errors.reportError(ERROR_CODE.WRONG_USAGE, "Unrecognized option:" + cmd);
         return printUsageAndExit();
diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
index a4084ea..e233302 100644
--- hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
@@ -61,6 +61,7 @@ import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HRegionLocation;
 import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.TableExistsException;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.ServerName;
@@ -87,6 +88,7 @@ import org.apache.hadoop.hbase.master.TableLockManager;
 import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
+import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
@@ -1423,7 +1425,8 @@ public class TestHBaseFsck {
       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
       // for some time until children references are deleted. HBCK erroneously sees this as
       // overlapping regions
-      HBaseFsck hbck = doFsck(conf, true, true, false, false, false, true, true, true, false, false, null);
+      HBaseFsck hbck = doFsck(
+        conf, true, true, false, false, false, true, true, true, false, false, false, null);
       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
 
       // assert that the split hbase:meta entry is still there.
@@ -1486,7 +1489,8 @@ public class TestHBaseFsck {
           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN}); //no LINGERING_SPLIT_PARENT
 
       // now fix it. The fix should not revert the region split, but add daughters to META
-      hbck = doFsck(conf, true, true, false, false, false, false, false, false, false, false, null);
+      hbck = doFsck(
+        conf, true, true, false, false, false, false, false, false, false, false, false, null);
       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
 
@@ -2134,7 +2138,7 @@ public class TestHBaseFsck {
     }
   }
 
-  @Test(timeout=60000)
+  @Test(timeout=180000)
   public void testCheckTableLocks() throws Exception {
     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
     EnvironmentEdgeManager.injectEdge(edge);
@@ -2203,6 +2207,55 @@ public class TestHBaseFsck {
     writeLock.release(); // release for clean state
   }
 
+  /**
+   * Test orphaned table ZNode (for table states)
+   */
+  @Test
+  public void testOrphanedTableZNode() throws Exception {
+    TableName table = TableName.valueOf("testOrphanedZKTableEntry");
+
+    try {
+      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getZKTable().
+        setEnablingTable(table);
+
+      try {
+        setupTable(table);
+        Assert.fail(
+          "Create table should fail when its ZNode has already existed with ENABLING state.");
+      } catch(TableExistsException t) {
+        //Expected exception
+      }
+      // The setup table was interrupted in some state that needs some cleanup.
+      try {
+        deleteTable(table);
+      } catch (IOException e) {
+        // Because create table failed, it is expected that the cleanup table would
+        // throw some exception. Ignore and continue.
+      }
+
+      HBaseFsck hbck = doFsck(conf, false);
+      assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
+
+      // fix the orphaned ZK entry
+      hbck = doFsck(conf, true);
+
+      // check that orphaned ZK table entry is gone.
+      hbck = doFsck(conf, false);
+      assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
+      // Now create table should succeed.
+      setupTable(table);
+    } finally {
+      // This code could be called that either a table was created successfully or set up
+      // table failed in some unknown state. Therefore, clean up can either succeed or fail.
+      try {
+        deleteTable(table);
+      } catch (IOException e) {
+        // The cleanup table would throw some exception if create table failed in some state.
+        // Ignore this exception
+      }
+    }
+  }
+
   @Test
   public void testMetaOffline() throws Exception {
     // check no errors
@@ -2408,7 +2461,8 @@ public class TestHBaseFsck {
 
     // fix hole
     assertErrors(
-        doFsck(conf, false, true, false, false, false, false, false, false, false, false, null),
+        doFsck(
+          conf, false, true, false, false, false, false, false, false, false, false, false, null),
         new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
 
diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/HbckTestingUtil.java hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/HbckTestingUtil.java
index 1f6ec70..bfa3217 100644
--- hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/HbckTestingUtil.java
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/HbckTestingUtil.java
@@ -40,13 +40,14 @@ public class HbckTestingUtil {
 
   public static HBaseFsck doFsck(
       Configuration conf, boolean fix, TableName table) throws Exception {
-    return doFsck(conf, fix, fix, fix, fix,fix, fix, fix, fix, fix, fix, table);
+    return doFsck(conf, fix, fix, fix, fix,fix, fix, fix, fix, fix, fix, fix, table);
   }
 
   public static HBaseFsck doFsck(Configuration conf, boolean fixAssignments,
       boolean fixMeta, boolean fixHdfsHoles, boolean fixHdfsOverlaps,
       boolean fixHdfsOrphans, boolean fixTableOrphans, boolean fixVersionFile,
       boolean fixReferenceFiles, boolean fixEmptyMetaRegionInfo, boolean fixTableLocks,
+      boolean fixTableZnodes,
       TableName table) throws Exception {
     HBaseFsck fsck = new HBaseFsck(conf, exec);
     fsck.connect();
@@ -62,6 +63,7 @@ public class HbckTestingUtil {
     fsck.setFixReferenceFiles(fixReferenceFiles);
     fsck.setFixEmptyMetaCells(fixEmptyMetaRegionInfo);
     fsck.setFixTableLocks(fixTableLocks);
+    fsck.setFixTableZNodes(fixTableZnodes);
     if (table != null) {
       fsck.includeTable(table);
     }