diff --git src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotFileCache.java src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotFileCache.java index ec14aa1..3486c8e 100644 --- src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotFileCache.java +++ src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotFileCache.java @@ -19,14 +19,10 @@ package org.apache.hadoop.hbase.master.snapshot; import java.io.FileNotFoundException; import java.io.IOException; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.Timer; -import java.util.TimerTask; +import java.util.*; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; @@ -39,13 +35,14 @@ import org.apache.hadoop.hbase.Stoppable; import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils; import org.apache.hadoop.hbase.util.FSUtils; + /** * Intelligently keep track of all the files for all the snapshots. *

* A cache of files is kept to avoid querying the {@link FileSystem} frequently. If there is a cache * miss the directory modification time is used to ensure that we don't rescan directories that we * already have in cache. We only check the modification times of the snapshot directories - * (/hbase/.snapshot/[snapshot_name]) to determine if the files need to be loaded into the cache. + * (/hbase/.hbase-snapshot/[snapshot_name]) to determine if the files need to be loaded into the cache. *

* New snapshots will be added to the cache and deleted snapshots will be removed when we refresh * the cache. If the files underneath a snapshot directory are changed, but not the snapshot itself, @@ -63,8 +60,8 @@ import org.apache.hadoop.hbase.util.FSUtils; * This allows you to only cache files under, for instance, all the logs in the .logs directory or * all the files under all the regions. *

- * this also considers all running snapshots (those under /hbase/.snapshot/.tmp) as valid - * snapshots and will attempt to cache files from those snapshots as well. + * this also considers all running snapshots (those under /hbase/.hbase-snapshot/.tmp) as valid + * snapshots but will not attempt to cache files from that directory. *

* Queries about a given file are thread-safe with respect to multiple queries and cache refreshes. */ @@ -165,26 +162,40 @@ public class SnapshotFileCache implements Stoppable { * at that point, cache will still think the file system contains that file and return * true, even if it is no longer present (false positive). However, if the file never was * on the filesystem, we will never find it and always return false. - * @param fileName file to check - * @return false if the file is not referenced in any current or running snapshot, - * true if the file is in the cache. + * @param files file to check, NOTE: Relies that files are loaded from hdfs before method is called (NOT LAZY) + * @return unReferencedFiles the collection of files that do not have snapshot references * @throws IOException if there is an unexpected error reaching the filesystem. */ // XXX this is inefficient to synchronize on the method, when what we really need to guard against // is an illegal access to the cache. Really we could do a mutex-guarded pointer swap on the // cache, but that seems overkill at the moment and isn't necessarily a bottleneck. - public synchronized boolean contains(String fileName) throws IOException { - if (this.cache.contains(fileName)) return true; - + public synchronized Iterable getUnreferencedFiles(Iterable files) throws IOException { + List unReferencedFiles = Lists.newArrayList(); + List snapshotsInProgress = null; + boolean refreshed = false; + for (FileStatus file : files) { + String fileName = file.getPath().getName(); + if (!refreshed && !cache.contains(fileName)) { refreshCache(); - - // then check again - return this.cache.contains(fileName); + refreshed = true; + } + if (cache.contains(fileName)) { + continue; + } + if (snapshotsInProgress == null) { + snapshotsInProgress = getSnapshotsInProgress(); + } + if (snapshotsInProgress.contains(fileName)) { + continue; + } + unReferencedFiles.add(file); + } + return unReferencedFiles; } private synchronized void refreshCache() throws IOException { - // get the status of the snapshots directory and /.tmp - FileStatus dirStatus, tempStatus; + // get the status of the snapshots directory + FileStatus dirStatus; try { dirStatus = fs.getFileStatus(snapshotDir); } catch (FileNotFoundException e) { @@ -194,16 +205,7 @@ public class SnapshotFileCache implements Stoppable { return; } - try { - Path snapshotTmpDir = new Path(snapshotDir, SnapshotDescriptionUtils.SNAPSHOT_TMP_DIR_NAME); - tempStatus = fs.getFileStatus(snapshotTmpDir); - } catch (FileNotFoundException e) { - tempStatus = dirStatus; - } - - // if the snapshot directory wasn't modified since we last check, we are done - if (dirStatus.getModificationTime() <= lastModifiedTime && - tempStatus.getModificationTime() <= lastModifiedTime) { + if (dirStatus.getModificationTime() <= lastModifiedTime) { return; } @@ -213,8 +215,7 @@ public class SnapshotFileCache implements Stoppable { // However, snapshot directories are only created once, so this isn't an issue. // 1. update the modified time - this.lastModifiedTime = Math.min(dirStatus.getModificationTime(), - tempStatus.getModificationTime()); + this.lastModifiedTime = dirStatus.getModificationTime(); // 2.clear the cache this.cache.clear(); @@ -234,15 +235,7 @@ public class SnapshotFileCache implements Stoppable { // 3.1 iterate through the on-disk snapshots for (FileStatus snapshot : snapshots) { String name = snapshot.getPath().getName(); - // its the tmp dir - if (name.equals(SnapshotDescriptionUtils.SNAPSHOT_TMP_DIR_NAME)) { - // only add those files to the cache, but not to the known snapshots - FileStatus[] running = FSUtils.listStatus(fs, snapshot.getPath()); - if (running == null) continue; - for (FileStatus run : running) { - this.cache.addAll(fileInspector.filesUnderSnapshot(run.getPath())); - } - } else { + if (!name.equals(SnapshotDescriptionUtils.SNAPSHOT_TMP_DIR_NAME)) { SnapshotDirectoryInfo files = this.snapshots.remove(name); // 3.1.1 if we don't know about the snapshot or its been modified, we need to update the files // the latter could occur where I create a snapshot, then delete it, and then make a new @@ -264,6 +257,20 @@ public class SnapshotFileCache implements Stoppable { this.snapshots.putAll(known); } + @VisibleForTesting List getSnapshotsInProgress() throws IOException { + List snapshotInProgress = Lists.newArrayList(); + // only add those files to the cache, but not to the known snapshots + Path snapshotTmpDir = new Path(snapshotDir, SnapshotDescriptionUtils.SNAPSHOT_TMP_DIR_NAME); + // only add those files to the cache, but not to the known snapshots + FileStatus[] running = FSUtils.listStatus(fs, snapshotTmpDir); + if (running != null) { + for (FileStatus run : running) { + snapshotInProgress.addAll(fileInspector.filesUnderSnapshot(run.getPath())); + } + } + return snapshotInProgress; + } + /** * Simple helper task that just periodically attempts to refresh the cache */ @@ -321,4 +328,5 @@ public class SnapshotFileCache implements Stoppable { return this.lastModified < mtime; } } + } diff --git src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotHFileCleaner.java src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotHFileCleaner.java index e82ca16..0f3d311 100644 --- src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotHFileCleaner.java +++ src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotHFileCleaner.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.snapshot; import java.io.IOException; import java.util.Collection; +import java.util.Collections; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -55,14 +56,18 @@ public class SnapshotHFileCleaner extends BaseHFileCleanerDelegate { private SnapshotFileCache cache; @Override - public synchronized boolean isFileDeletable(FileStatus fStat) { + public synchronized Iterable getDeletableFiles(Iterable files) { try { - return !cache.contains(fStat.getPath().getName()); + return cache.getUnreferencedFiles(files); } catch (IOException e) { - LOG.error("Exception while checking if:" + fStat.getPath() - + " was valid, keeping it just in case.", e); - return false; + LOG.error("Exception while checking if files were valid, keeping them just in case.", e); + return Collections.emptyList(); + } } + + @Override + protected boolean isFileDeletable(FileStatus fStat) { + return false; } @Override diff --git src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotLogCleaner.java src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotLogCleaner.java index c1edd6f..7da64c5 100644 --- src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotLogCleaner.java +++ src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotLogCleaner.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.snapshot; import java.io.IOException; import java.util.Collection; +import java.util.Collections; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -54,14 +55,13 @@ public class SnapshotLogCleaner extends BaseLogCleanerDelegate { private SnapshotFileCache cache; @Override - public synchronized boolean isLogDeletable(FileStatus fStat) { + public synchronized Iterable getDeletableFiles(Iterable files) { + if (null == cache) return Collections.emptyList(); try { - if (null == cache) return false; - return !cache.contains(fStat.getPath().getName()); + return cache.getUnreferencedFiles(files); } catch (IOException e) { - LOG.error("Exception while checking if:" + fStat.getPath() - + " was valid, keeping it just in case.", e); - return false; + LOG.error("Exception while checking if files were valid, keeping them just in case.", e); + return Collections.emptyList(); } } diff --git src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotFileCache.java src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotFileCache.java index 409f697..5bb441c 100644 --- src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotFileCache.java +++ src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotFileCache.java @@ -17,15 +17,19 @@ */ package org.apache.hadoop.hbase.master.snapshot; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.IOException; -import java.util.Collection; -import java.util.HashSet; +import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; +import com.google.common.collect.Iterables; +import com.google.common.collect.ObjectArrays; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseTestingUtility; @@ -85,6 +89,7 @@ public class TestSnapshotFileCache { Path file1 = new Path(family, "file1"); Path file2 = new Path(family, "file2"); + // create two hfiles under the snapshot fs.createNewFile(file1); fs.createNewFile(file2); @@ -92,10 +97,13 @@ public class TestSnapshotFileCache { FSUtils.logFileSystemState(fs, rootDir, LOG); // then make sure the cache finds them - assertTrue("Cache didn't find:" + file1, cache.contains(file1.getName())); - assertTrue("Cache didn't find:" + file2, cache.contains(file2.getName())); + Iterable nonSnapshotFiles = cache.getUnreferencedFiles( + Arrays.asList(FSUtils.listStatus(fs, family)) + ); + assertFalse("Cache didn't find:" + file1, Iterables.contains(nonSnapshotFiles, file1)); + assertFalse("Cache didn't find:" + file2, Iterables.contains(nonSnapshotFiles, file2)); String not = "file-shouldn't-be-found"; - assertFalse("Cache found '" + not + "', but it shouldn't have.", cache.contains(not)); + assertFalse("Cache found '" + not + "', but it shouldn't have.", Iterables.contains(nonSnapshotFiles, not)); // make sure we get a little bit of separation in the modification times // its okay if we sleep a little longer (b/c of GC pause), as long as we sleep a little @@ -110,21 +118,80 @@ public class TestSnapshotFileCache { LOG.debug("Checking to see if file is deleted."); - assertTrue("Cache didn't find:" + file1, cache.contains(file1.getName())); - assertTrue("Cache didn't find:" + file2, cache.contains(file2.getName())); + nonSnapshotFiles = cache.getUnreferencedFiles( + nonSnapshotFiles + ); + + assertFalse("Cache didn't find:" + file1, Iterables.contains(nonSnapshotFiles, file1)); + assertFalse("Cache didn't find:" + file2, Iterables.contains(nonSnapshotFiles, file2)); // then trigger a refresh cache.triggerCacheRefreshForTesting(); + + nonSnapshotFiles = cache.getUnreferencedFiles( + nonSnapshotFiles + ); // and not it shouldn't find those files assertFalse("Cache found '" + file1 + "', but it shouldn't have.", - cache.contains(file1.getName())); + Iterables.contains(nonSnapshotFiles, file1)); assertFalse("Cache found '" + file2 + "', but it shouldn't have.", - cache.contains(file2.getName())); + Iterables.contains(nonSnapshotFiles, file2)); fs.delete(snapshotDir, true); } @Test + public void testWeNeverCacheTmpDirAndLoadIt() throws Exception { + + final AtomicInteger count = new AtomicInteger(0); + // don't refresh the cache unless we tell it to + long period = Long.MAX_VALUE; + Path snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(rootDir); + SnapshotFileCache cache = new SnapshotFileCache(fs, rootDir, period, 10000000, + "test-snapshot-file-cache-refresh", new SnapshotFiles()) { + @Override + List getSnapshotsInProgress() throws IOException { + List result = super.getSnapshotsInProgress(); + count.incrementAndGet(); + return result; + } + }; + + // create a file in a 'completed' snapshot + Path snapshot = new Path(snapshotDir, "snapshot"); + Path region = new Path(snapshot, "7e91021"); + Path family = new Path(region, "fam"); + Path file1 = new Path(family, "file1"); + fs.createNewFile(file1); + + FileStatus[] completedFiles = FSUtils.listStatus(fs, family); + + // create an 'in progress' snapshot + SnapshotDescription desc = SnapshotDescription.newBuilder().setName("working").build(); + snapshot = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir); + region = new Path(snapshot, "7e91021"); + family = new Path(region, "fam"); + Path file2 = new Path(family, "file2"); + fs.createNewFile(file2); + cache.triggerCacheRefreshForTesting(); + + Iterable deletableFiles = cache.getUnreferencedFiles(Arrays.asList( + ObjectArrays.concat(completedFiles, FSUtils.listStatus(fs, family), FileStatus.class)) + ); + assertTrue(Iterables.isEmpty(deletableFiles)); + assertEquals(1, count.get()); // we check the tmp directory + + Path file3 = new Path(family, "file3"); + fs.create(file3); + deletableFiles = cache.getUnreferencedFiles(Arrays.asList( + ObjectArrays.concat(completedFiles, FSUtils.listStatus(fs, family), FileStatus.class)) + ); + assertTrue(Iterables.isEmpty(deletableFiles)); + assertEquals(2, count.get()); // we check the tmp directory + + } + + @Test public void testLoadsTmpDir() throws Exception { // don't refresh the cache unless we tell it to long period = Long.MAX_VALUE; @@ -150,8 +217,11 @@ public class TestSnapshotFileCache { FSUtils.logFileSystemState(fs, rootDir, LOG); // then make sure the cache finds both files - assertTrue("Cache didn't find:" + file1, cache.contains(file1.getName())); - assertTrue("Cache didn't find:" + file2, cache.contains(file2.getName())); + Iterable nonSnapshotFiles = cache.getUnreferencedFiles( + Arrays.asList(FSUtils.listStatus(fs, family)) + ); + assertFalse("Cache didn't find:" + file1, Iterables.contains(nonSnapshotFiles, file1)); + assertFalse("Cache didn't find:" + file2, Iterables.contains(nonSnapshotFiles, file2)); } @Test @@ -181,10 +251,13 @@ public class TestSnapshotFileCache { FSUtils.logFileSystemState(fs, rootDir, LOG); + Iterable nonSnapshotFiles = cache.getUnreferencedFiles( + Arrays.asList(FSUtils.listStatus(fs, family)) + ); // then make sure the cache only finds the log files assertFalse("Cache found '" + file1 + "', but it shouldn't have.", - cache.contains(file1.getName())); - assertTrue("Cache didn't find:" + log, cache.contains(log.getName())); + Iterables.contains(nonSnapshotFiles, file1)); + assertFalse("Cache didn't find:" + log, Iterables.contains(nonSnapshotFiles, log)); } @Test @@ -207,7 +280,10 @@ public class TestSnapshotFileCache { FSUtils.logFileSystemState(fs, rootDir, LOG); - assertTrue("Cache didn't find " + file1, cache.contains(file1.getName())); + Iterable nonSnapshotFiles = cache.getUnreferencedFiles( + Arrays.asList(FSUtils.listStatus(fs, family)) + ); + assertFalse("Cache didn't find " + file1, Iterables.contains(nonSnapshotFiles, file1)); // now delete the snapshot and add a file with a different name fs.delete(snapshot, true); @@ -215,13 +291,15 @@ public class TestSnapshotFileCache { fs.createNewFile(file3); FSUtils.logFileSystemState(fs, rootDir, LOG); - assertTrue("Cache didn't find new file:" + file3, cache.contains(file3.getName())); + nonSnapshotFiles = cache.getUnreferencedFiles( + Arrays.asList(FSUtils.listStatus(fs, family)) + ); + assertFalse("Cache didn't find new file:" + file3, Iterables.contains(nonSnapshotFiles, file3)); } @Test public void testSnapshotTempDirReload() throws IOException { long period = Long.MAX_VALUE; - // This doesn't refresh cache until we invoke it explicitly Path snapshotDir = new Path(SnapshotDescriptionUtils.getSnapshotsDir(rootDir), SnapshotDescriptionUtils.SNAPSHOT_TMP_DIR_NAME); SnapshotFileCache cache = new SnapshotFileCache(fs, rootDir, period, 10000000, @@ -231,13 +309,13 @@ public class TestSnapshotFileCache { Path snapshot1 = new Path(snapshotDir, "snapshot1"); Path file1 = new Path(new Path(new Path(snapshot1, "7e91021"), "fam"), "file1"); fs.createNewFile(file1); - assertTrue(cache.contains(file1.getName())); + assertTrue(cache.getSnapshotsInProgress().contains(file1.getName())); // Add another snapshot Path snapshot2 = new Path(snapshotDir, "snapshot2"); Path file2 = new Path(new Path(new Path(snapshot2, "7e91021"), "fam2"), "file2"); fs.createNewFile(file2); - assertTrue(cache.contains(file2.getName())); + assertTrue(cache.getSnapshotsInProgress().contains((file2.getName()))); } class SnapshotFiles implements SnapshotFileCache.SnapshotFileInspector {