diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
index 5a9344b..4a9802d 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
@@ -31,6 +31,7 @@ import java.util.TreeMap;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
@@ -46,6 +47,7 @@ import org.apache.hadoop.hbase.client.RegionReplicaUtil;
 import org.apache.hadoop.hbase.master.RegionState.State;
 import org.apache.hadoop.hbase.client.TableState;
 import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.hadoop.hbase.util.Pair;
 
 /**
@@ -591,6 +593,16 @@ public class RegionStates {
           State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) {
         LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
         rits.add(hri);
+      } else if (isOneOfStates(state, State.SPLITTING_NEW)) {
+        try {
+          if (MetaTableAccessor.getRegion(server.getConnection(), state.getRegion()
+              .getEncodedNameAsBytes()) == null) {
+            FSUtils.deleteRegionDir(server.getConfiguration(), state.getRegion());
+          }
+        } catch (IOException e) {
+          LOG.warn("Got exception while deleting " + state.getRegion()
+              + " directory from file system.", e);
+        }
       } else {
         LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
       }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java
index 50532a1..7cda55d 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java
@@ -183,6 +183,21 @@ public abstract class FSUtils {
   }
 
   /**
+   * Delete the region directory if it exists.
+   * @param conf the current configuration
+   * @param hri the region whose directory is to be deleted
+   * @return True if the region directory was deleted.
+   * @throws IOException
+   */
+  public static boolean deleteRegionDir(final Configuration conf, final HRegionInfo hri)
+      throws IOException {
+    Path rootDir = getRootDir(conf);
+    FileSystem fs = rootDir.getFileSystem(conf);
+    return deleteDirectory(fs,
+      new Path(getTableDir(rootDir, hri.getTable()), hri.getEncodedName()));
+  }
+
+  /**
    * Return the number of bytes that large input files should be optimally
    * be split into to minimize i/o time.
   *
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
index 94da7ab..f3ebdb0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
@@ -53,7 +53,6 @@
 import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
-
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -89,6 +88,7 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.classification.InterfaceStability;
 import org.apache.hadoop.hbase.client.Admin;
 import org.apache.hadoop.hbase.client.ClusterConnection;
+import org.apache.hadoop.hbase.client.Connection;
 import org.apache.hadoop.hbase.client.ConnectionFactory;
 import org.apache.hadoop.hbase.client.Delete;
 import org.apache.hadoop.hbase.client.Get;
@@ -1996,6 +1996,42 @@ public class HBaseFsck extends Configured implements Closeable {
       return;
     }
 
+    HRegionInfo hri = hbi.getHdfsHRI();
+    TableInfo tableInfo = tablesInfo.get(hri.getTable());
+    if (tableInfo.regionsFromMeta.isEmpty()) {
+      for (HbckInfo h : regionInfoMap.values()) {
+        if (h.getTableName().equals(hri.getTable())) {
+          if (h.metaEntry != null) tableInfo.regionsFromMeta
+              .add((HRegionInfo) h.metaEntry);
+        }
+      }
+      Collections.sort(tableInfo.regionsFromMeta);
+    }
+    for (HRegionInfo region : tableInfo.regionsFromMeta) {
+      if (Bytes.compareTo(region.getStartKey(), hri.getStartKey()) <= 0
+          && (region.getEndKey().length == 0 || Bytes.compareTo(region.getEndKey(),
+            hri.getEndKey()) >= 0)
+          && Bytes.compareTo(region.getStartKey(), hri.getEndKey()) <= 0) {
+        Path regionDir = hbi.getHdfsRegionDir();
+        FileSystem fs = regionDir.getFileSystem(getConf());
+        List<Path> familyDirs = FSUtils.getFamilyDirs(fs, regionDir);
+        for (Path familyDir : familyDirs) {
+          List<Path> referenceFilePaths = FSUtils.getReferenceFilePaths(fs, familyDir);
+          for (Path referenceFilePath : referenceFilePaths) {
+            Path parentRegionDir =
+                StoreFileInfo.getReferredToFile(referenceFilePath).getParent().getParent();
+            if (parentRegionDir.toString().endsWith(region.getEncodedName())) {
+              LOG.warn(hri + " start and stop keys are in the range of " + region
+                  + ". The region might not be cleaned up from hdfs when region " + region
+                  + " split failed. Hence deleting from hdfs.");
+              HRegionFileSystem.deleteRegionFromFileSystem(getConf(), fs,
+                regionDir.getParent(), hri);
+              return;
+            }
+          }
+        }
+      }
+    }
     LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
     int numReplicas = admin.getTableDescriptor(hbi.getTableName()).getRegionReplication();
     HBaseFsckRepair.fixMetaHoleOnlineAndAddReplicas(getConf(), hbi.getHdfsHRI(),
@@ -2325,6 +2361,9 @@ public class HBaseFsck extends Configured implements Closeable {
     final Multimap<byte[], HbckInfo> overlapGroups =
       TreeMultimap.create(RegionSplitCalculator.BYTES_COMPARATOR, cmp);
 
+    // list of regions derived from meta entries.
+    final List<HRegionInfo> regionsFromMeta = new ArrayList<HRegionInfo>();
+
     TableInfo(TableName name) {
       this.tableName = name;
       deployedOn = new TreeSet<ServerName>();
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
index 0d574f6..087ebe4 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
@@ -952,6 +952,44 @@ public class TestSplitTransactionOnCluster {
     }
   }
 
+  @Test (timeout=300000)
+  public void testSSHCleanupDaugtherRegionsOfAbortedSplit() throws Exception {
+    TableName table = TableName.valueOf("testSSHCleanupDaugtherRegionsOfAbortedSplit");
+    try {
+      HTableDescriptor desc = new HTableDescriptor(table);
+      desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
+      admin.createTable(desc);
+      HTable hTable = new HTable(cluster.getConfiguration(), desc.getTableName());
+      for(int i = 1; i < 5; i++) {
+        Put p1 = new Put(("r"+i).getBytes());
+        p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
+        hTable.put(p1);
+      }
+      admin.flush(desc.getTableName());
+      List<HRegion> regions = cluster.getRegions(desc.getTableName());
+      int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
+      HRegionServer regionServer = cluster.getRegionServer(serverWith);
+      cluster.getServerWith(regions.get(0).getRegionName());
+      SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
+      st.prepare();
+      st.stepsBeforePONR(regionServer, regionServer, false);
+      Path tableDir =
+          FSUtils.getTableDir(cluster.getMaster().getMasterFileSystem().getRootDir(),
+            desc.getTableName());
+      tableDir.getFileSystem(cluster.getConfiguration());
+      List<Path> regionDirs =
+          FSUtils.getRegionDirs(tableDir.getFileSystem(cluster.getConfiguration()), tableDir);
+      assertEquals(3, regionDirs.size());
+      AssignmentManager am = cluster.getMaster().getAssignmentManager();
+      am.processServerShutdown(regionServer.getServerName());
+      regionDirs =
+          FSUtils.getRegionDirs(tableDir.getFileSystem(cluster.getConfiguration()), tableDir);
+      assertEquals(1, regionDirs.size());
+    } finally {
+      TESTING_UTIL.deleteTable(table);
+    }
+  }
+
   private void testSplitBeforeSettingSplittingInZKInternals() throws Exception {
     final TableName tableName = TableName.valueOf("testSplitBeforeSettingSplittingInZK");
     try {
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
index 4fa78f4..33bd337 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
@@ -88,6 +88,7 @@ import org.apache.hadoop.hbase.client.Table;
 import org.apache.hadoop.hbase.io.hfile.TestHFile;
 import org.apache.hadoop.hbase.master.AssignmentManager;
 import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.RegionState;
 import org.apache.hadoop.hbase.master.RegionStates;
 import org.apache.hadoop.hbase.master.TableLockManager;
 import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
@@ -96,6 +97,7 @@ import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.regionserver.SplitTransaction;
 import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.testclassification.MiscTests;
@@ -1173,6 +1175,61 @@ public class TestHBaseFsck {
     }
   }
 
+  @Test (timeout=180000)
+  public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
+    TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    try {
+      HTableDescriptor desc = new HTableDescriptor(table);
+      desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
+      admin.createTable(desc);
+      tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
+      for (int i = 0; i < 5; i++) {
+        Put p1 = new Put(("r" + i).getBytes());
+        p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
+        tbl.put(p1);
+      }
+      admin.flush(desc.getTableName());
+      List<HRegion> regions = cluster.getRegions(desc.getTableName());
+      int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
+      HRegionServer regionServer = cluster.getRegionServer(serverWith);
+      cluster.getServerWith(regions.get(0).getRegionName());
+      SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
+      st.prepare();
+      st.stepsBeforePONR(regionServer, regionServer, false);
+      AssignmentManager am = cluster.getMaster().getAssignmentManager();
+      Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
+      for (RegionState state : regionsInTransition.values()) {
+        am.regionOffline(state.getRegion());
+      }
+      Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
+      regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
+      am.assign(regionsMap);
+      am.waitForAssignment(regions.get(0).getRegionInfo());
+      HBaseFsck hbck = doFsck(conf, false);
+      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
+          ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
+      // holes are separate from overlap groups
+      assertEquals(0, hbck.getOverlapGroups(table).size());
+
+      // fix hole
+      assertErrors(
+        doFsck(conf, false, true, false, false, false, false, false, false, false, false, null),
+        new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
+            ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
+
+      // check that hole fixed
+      assertNoErrors(doFsck(conf, false));
+      assertEquals(5, countRows());
+    } finally {
+      if (tbl != null) {
+        tbl.close();
+        tbl = null;
+      }
+      cleanupTable(table);
+    }
+  }
+
   /**
    * This creates fixes a bad table with a hole in meta.
    */
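
Note on the approach (explanatory, not part of the patch): the RegionStates change encodes a single rule: if a region that was in SPLITTING_NEW has no row in hbase:meta by the time server-shutdown handling processes its host, the split never reached the point of no return, so the daughter's directory on HDFS is an orphan and is safe to delete. A minimal sketch of that rule using the helper this patch adds; the class name and the server/daughter variables below are illustrative only, not code from the patch:

    import java.io.IOException;

    import org.apache.hadoop.hbase.HRegionInfo;
    import org.apache.hadoop.hbase.MetaTableAccessor;
    import org.apache.hadoop.hbase.Server;
    import org.apache.hadoop.hbase.util.FSUtils;

    /** Sketch only: mirrors the SSH cleanup added to RegionStates above. */
    final class AbortedSplitCleanupSketch {
      static void cleanupIfNotInMeta(Server server, HRegionInfo daughter)
          throws IOException {
        // A null meta entry means the daughter was never committed by the split.
        if (MetaTableAccessor.getRegion(server.getConnection(),
            daughter.getEncodedNameAsBytes()) == null) {
          // deleteRegionDir resolves <rootdir>/<table dir>/<encoded name> and removes it.
          FSUtils.deleteRegionDir(server.getConfiguration(), daughter);
        }
      }
    }

The HBaseFsck change is the offline complement of the same rule: a region directory whose key range is already covered by a region present in meta, and whose store files are reference files pointing back at that covering region, is treated as a leftover daughter of a failed split and is deleted from HDFS instead of being patched back into hbase:meta.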