diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
index cd524b5..6e634f2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
@@ -31,12 +31,16 @@ import java.util.TreeMap;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HRegionLocation;
 import org.apache.hadoop.hbase.MetaTableAccessor;
 import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.ServerLoad;
@@ -45,7 +49,9 @@ import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
 import org.apache.hadoop.hbase.master.RegionState.State;
 import org.apache.hadoop.hbase.client.TableState;
+import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
 import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.hadoop.hbase.util.Pair;
 
 /**
@@ -591,6 +597,21 @@ public class RegionStates {
         State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) {
         LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
         rits.add(hri);
+      } else if (isOneOfStates(state, State.SPLITTING_NEW)) {
+        try {
+          HRegionLocation r =
+              MetaTableAccessor.getRegionLocation(server.getConnection(), state.getRegion()
+                  .getEncodedNameAsBytes());
+          if (r == null) {
+            Configuration conf = this.server.getConfiguration();
+            MasterFileSystem masterFileSystem = ((HMaster) server).getMasterFileSystem();
+            FileSystem fs = masterFileSystem.getFileSystem();
+            Path tableDir = FSUtils.getTableDir(masterFileSystem.getRootDir(), hri.getTable());
+            HRegionFileSystem.deleteRegionFromFileSystem(conf, fs, tableDir, hri);
+          }
+        } catch (IOException e) {
+          // Nothing to do here.
+        }
       } else {
         LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
       }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
index e507df4..4086796 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
@@ -49,7 +49,6 @@ import java.util.concurrent.ScheduledThreadPoolExecutor;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 
-
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -85,6 +84,7 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.classification.InterfaceStability;
 import org.apache.hadoop.hbase.client.Admin;
 import org.apache.hadoop.hbase.client.ClusterConnection;
+import org.apache.hadoop.hbase.client.Connection;
 import org.apache.hadoop.hbase.client.ConnectionFactory;
 import org.apache.hadoop.hbase.client.Delete;
 import org.apache.hadoop.hbase.client.Get;
@@ -1974,12 +1974,37 @@ public class HBaseFsck extends Configured implements Closeable {
           " used.");
       return;
     }
-
+    Connection conn = ConnectionFactory.createConnection(getConf());
+    HRegionInfo hri = hbi.getHdfsHRI();
+    List<HRegionInfo> tableRegions = MetaTableAccessor.getTableRegions(conn, hri.getTable(), true);
+    if (tableRegions != null) {
+      for (HRegionInfo region : tableRegions) {
+        if (Bytes.compareTo(region.getStartKey(), hri.getStartKey()) <= 0
+            && (region.getEndKey().length == 0 || Bytes.compareTo(region.getEndKey(),
+              hri.getEndKey()) >= 0) && Bytes.compareTo(region.getStartKey(), hri.getEndKey()) <= 0) {
+          Path regionDir = hbi.getHdfsRegionDir();
+          FileSystem fs = regionDir.getFileSystem(getConf());
+          List<Path> familyDirs = FSUtils.getFamilyDirs(fs, regionDir);
+          for (Path familyDir : familyDirs) {
+            List<Path> referenceFilePaths = FSUtils.getReferenceFilePaths(fs, familyDir);
+            for (Path referenceFilePath : referenceFilePaths) {
+              Path parentRegionDir = StoreFileInfo.getReferredToFile(referenceFilePath).getParent().getParent();
+              if (parentRegionDir.toString().endsWith(region.getEncodedName())) {
+                LOG.warn(hri + " start and stop keys are in the range of " + region
+                    + ". The region might not have been cleaned up from hdfs when region "
+                    + region + " split failed. Hence deleting it from hdfs.");
+                HRegionFileSystem.deleteRegionFromFileSystem(getConf(), fs, regionDir.getParent(), hri);
+                return;
+              }
+            }
+          }
+        }
+      }
+    }
     LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
     int numReplicas = admin.getTableDescriptor(hbi.getTableName()).getRegionReplication();
     HBaseFsckRepair.fixMetaHoleOnlineAndAddReplicas(getConf(), hbi.getHdfsHRI(),
         admin.getClusterStatus().getServers(), numReplicas);
-
     tryAssignmentRepair(hbi, "Trying to reassign region...");
   }
 
@@ -2581,7 +2606,6 @@ public class HBaseFsck extends Configured implements Closeable {
             + ". Just continuing... ", ioe);
       }
     }
-
     // create new empty container region.
     HTableDescriptor htd = getTableInfo().getHTD();
     // from start key to end Key
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
index 4138027..5b20df3 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
@@ -935,6 +935,44 @@ public class TestSplitTransactionOnCluster {
     }
   }
 
+  @Test (timeout=300000)
+  public void testSSHCleanupDaugtherRegionsOfAbortedSplit() throws Exception {
+    TableName table = TableName.valueOf("testSSHCleanupDaugtherRegionsOfAbortedSplit");
+    try {
+      HTableDescriptor desc = new HTableDescriptor(table);
+      desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
+      admin.createTable(desc);
+      HTable hTable = new HTable(cluster.getConfiguration(), desc.getTableName());
+      for (int i = 1; i < 5; i++) {
+        Put p1 = new Put(("r" + i).getBytes());
+        p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
+        hTable.put(p1);
+      }
+      admin.flush(desc.getTableName());
+      List<HRegion> regions = cluster.getRegions(desc.getTableName());
+      int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
+      HRegionServer regionServer = cluster.getRegionServer(serverWith);
+      cluster.getServerWith(regions.get(0).getRegionName());
+      SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
+      st.prepare();
+      st.stepsBeforePONR(regionServer, regionServer, false);
+      Path tableDir =
+          FSUtils.getTableDir(cluster.getMaster().getMasterFileSystem().getRootDir(),
+            desc.getTableName());
+      tableDir.getFileSystem(cluster.getConfiguration());
+      List<Path> regionDirs =
+          FSUtils.getRegionDirs(tableDir.getFileSystem(cluster.getConfiguration()), tableDir);
+      assertEquals(3, regionDirs.size());
+      AssignmentManager am = cluster.getMaster().getAssignmentManager();
+      am.processServerShutdown(regionServer.getServerName());
+      regionDirs =
+          FSUtils.getRegionDirs(tableDir.getFileSystem(cluster.getConfiguration()), tableDir);
+      assertEquals(1, regionDirs.size());
+    } finally {
+      TESTING_UTIL.deleteTable(table);
+    }
+  }
+
   private void testSplitBeforeSettingSplittingInZKInternals() throws Exception {
     final TableName tableName = TableName.valueOf("testSplitBeforeSettingSplittingInZK");
     try {
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
index e13d7d4..b228bcf 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
@@ -88,6 +88,7 @@ import org.apache.hadoop.hbase.client.Table;
 import org.apache.hadoop.hbase.io.hfile.TestHFile;
 import org.apache.hadoop.hbase.master.AssignmentManager;
 import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.RegionState;
 import org.apache.hadoop.hbase.master.RegionStates;
 import org.apache.hadoop.hbase.master.TableLockManager;
 import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
@@ -96,6 +97,7 @@ import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.regionserver.SplitTransaction;
 import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.testclassification.MiscTests;
@@ -1175,6 +1177,64 @@ public class TestHBaseFsck {
   }
 
   /**
+   * This creates and fixes a bad table with a region that is missing in meta
+   * and not assigned to a region server.
+   */
+  @Test (timeout=180000)
+  public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
+    TableName table =
+        TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    try {
+      HTableDescriptor desc = new HTableDescriptor(table);
+      desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
+      admin.createTable(desc);
+      tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
+      for (int i = 0; i < 5; i++) {
+        Put p1 = new Put(("r" + i).getBytes());
+        p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
+        tbl.put(p1);
+      }
+      admin.flush(desc.getTableName());
+      List<HRegion> regions = cluster.getRegions(desc.getTableName());
+      int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
+      HRegionServer regionServer = cluster.getRegionServer(serverWith);
+      cluster.getServerWith(regions.get(0).getRegionName());
+      SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
+      st.prepare();
+      st.stepsBeforePONR(regionServer, regionServer, false);
+      AssignmentManager am = cluster.getMaster().getAssignmentManager();
+      Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
+      for (RegionState state : regionsInTransition.values()) {
+        am.regionOffline(state.getRegion());
+      }
+      Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
+      regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
+      am.assign(regionsMap);
+      am.waitForAssignment(regions.get(0).getRegionInfo());
+      HBaseFsck hbck = doFsck(conf, false);
+      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
+          ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
+      // holes are separate from overlap groups
+      assertEquals(0, hbck.getOverlapGroups(table).size());
+
+      // fix hole
+      assertErrors(doFsck(conf, false, true, false, false, false, false, false, false, false, false, null),
+        new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
+
+      // check that hole fixed
+      assertNoErrors(doFsck(conf, false));
+      assertEquals(5, countRows());
+    } finally {
+      if (tbl != null) {
+        tbl.close();
+        tbl = null;
+      }
+      cleanupTable(table);
+    }
+  }
+
+  /**
    * This creates and fixes a bad table with a hole in meta.
    */
   @Test (timeout=180000)
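
Note on the key-range predicate in the HBaseFsck hunk above: the three-clause comparison is dense, so the sketch below restates it as a standalone helper. The class and method names here are hypothetical, for illustration only and not part of the patch; HRegionInfo and Bytes are the real HBase types the patch uses.

import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.util.Bytes;

// Hypothetical helper (illustration only): restates the containment test that
// the HBaseFsck hunk uses to decide whether a live region from hbase:meta
// ("parent") covers the key range of a leftover daughter from a failed split.
public final class RegionRangeCheck {
  private RegionRangeCheck() {
  }

  public static boolean covers(HRegionInfo parent, HRegionInfo daughter) {
    // Parent must start at or before the daughter's start key.
    boolean startsBefore = Bytes.compareTo(parent.getStartKey(), daughter.getStartKey()) <= 0;
    // An empty end key marks the last region of the table, which covers everything.
    boolean endsAfter = parent.getEndKey().length == 0
        || Bytes.compareTo(parent.getEndKey(), daughter.getEndKey()) >= 0;
    // Parent must actually reach the daughter's range, not merely sort before it.
    boolean overlaps = Bytes.compareTo(parent.getStartKey(), daughter.getEndKey()) <= 0;
    return startsBefore && endsAfter && overlaps;
  }
}

Only when this predicate holds does the patch go on to inspect the daughter's reference files (via FSUtils.getReferenceFilePaths and StoreFileInfo.getReferredToFile) and, if a reference points back into the covering region, delete the daughter's directory from HDFS.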