From 8b6dc1d5a682503833646658d8c724d6356250c2 Mon Sep 17 00:00:00 2001 From: Mike Drob Date: Fri, 1 Jun 2018 20:59:50 -0500 Subject: [PATCH] HBASE-20674 Clean up SCR code and docs --- .../org/apache/hadoop/hbase/fs/HFileSystem.java | 51 +++++++------- .../hadoop/hbase/regionserver/HRegionServer.java | 3 +- .../java/org/apache/hadoop/hbase/util/FSUtils.java | 81 ---------------------- .../apache/hadoop/hbase/io/hfile/TestChecksum.java | 17 ++--- src/main/asciidoc/_chapters/performance.adoc | 65 ++++++++++++----- src/main/asciidoc/_chapters/schema_design.adoc | 34 +-------- 6 files changed, 83 insertions(+), 168 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/fs/HFileSystem.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/fs/HFileSystem.java index bc3d85e1f2..ee0b1c01e1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/fs/HFileSystem.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/fs/HFileSystem.java @@ -76,43 +76,40 @@ public class HFileSystem extends FilterFileSystem { * checksum verfication in hbase, otherwise * delegate checksum verification to the FileSystem. */ - public HFileSystem(Configuration conf, boolean useHBaseChecksum) - throws IOException { - - // Create the default filesystem with checksum verification switched on. - // By default, any operation to this FilterFileSystem occurs on - // the underlying filesystem that has checksums switched on. + public HFileSystem(final Configuration conf, final boolean useHBaseChecksum) throws IOException { + // The default file system that is used by all FilterFileSystem operations will have + // checksum verification turned on. We'll create an additional FileSystem handle later that may + // have checksum verification disabled, and that one will be used by our internal hot paths. this.fs = FileSystem.get(conf); this.useHBaseChecksum = useHBaseChecksum; - fs.initialize(getDefaultUri(conf), conf); + this.fs.initialize(getDefaultUri(conf), conf); + + boolean localFS = this.fs instanceof LocalFileSystem; // disable checksum verification for local fileSystem, see HBASE-11218 - if (fs instanceof LocalFileSystem) { - fs.setWriteChecksum(false); - fs.setVerifyChecksum(false); + if (localFS) { + this.fs.setWriteChecksum(false); + this.fs.setVerifyChecksum(false); } addLocationsOrderInterceptor(conf); - // If hbase checksum verification is switched on, then create a new - // filesystem object that has cksum verification turned off. - // We will avoid verifying checksums in the fs client, instead do it - // inside of hbase. - // If this is the local file system hadoop has a bug where seeks - // do not go to the correct location if setVerifyChecksum(false) is called. - // This manifests itself in that incorrect data is read and HFileBlocks won't be able to read - // their header magic numbers. See HBASE-5885 - if (useHBaseChecksum && !(fs instanceof LocalFileSystem)) { - conf = new Configuration(conf); - conf.setBoolean("dfs.client.read.shortcircuit.skip.checksum", true); - this.noChecksumFs = maybeWrapFileSystem(newInstanceFileSystem(conf), conf); - this.noChecksumFs.setVerifyChecksum(false); - } else { - this.noChecksumFs = maybeWrapFileSystem(fs, conf); - } - this.fs = maybeWrapFileSystem(this.fs, conf); + + // If HBase checksum verification is enabled, create a new FS instance with checksum + // verification turned off. We will avoid double-checking sums by skipping them in FS client + // and doing it only in HBase. 
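+    // (For the LocalFileSystem case, note also HBASE-5885: seeks went to the wrong offset on the
+    // local FS once setVerifyChecksum(false) was called, which left HFileBlock unable to read its
+    // header magic.)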
+ // LocalFS doesn't support SCR, so nothing left to configure in that case. + if (useHBaseChecksum && !localFS) { + Configuration checksumConf = new Configuration(conf); + // Not all Hadoop versions have HdfsClientConfigKeys.Read.ShortCircuit.SKIP_CHECKSUM_KEY + checksumConf.setBoolean("dfs.client.read.shortcircuit.skip.checksum", true); + this.noChecksumFs = maybeWrapFileSystem(newInstanceFileSystem(checksumConf), checksumConf); + this.noChecksumFs.setVerifyChecksum(false); + } else { + this.noChecksumFs = this.fs; + } } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index db7052e4a2..6f051c9c2b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -141,6 +141,7 @@ import org.apache.hadoop.hbase.trace.SpanReceiverHost; import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.hadoop.hbase.util.Addressing; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.util.CompressionTest; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.FSTableDescriptors; @@ -542,7 +543,7 @@ public class HRegionServer extends HasThread implements HFile.checkHFileVersion(this.conf); checkCodecs(this.conf); this.userProvider = UserProvider.instantiate(conf); - FSUtils.setupShortCircuitRead(this.conf); + CommonFSUtils.setupShortCircuitRead(this.conf); decorateRegionServerConfiguration(this.conf); // Disable usage of meta replicas in the regionserver diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java index 53db140b28..473919fa1b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/FSUtils.java @@ -129,36 +129,6 @@ public abstract class FSUtils extends CommonFSUtils { return fileSystem instanceof DistributedFileSystem; } - /** - * Compare path component of the Path URI; e.g. if hdfs://a/b/c and /a/b/c, it will compare the - * '/a/b/c' part. If you passed in 'hdfs://a/b/c and b/c, it would return true. Does not consider - * schema; i.e. if schemas different but path or subpath matches, the two will equate. - * @param pathToSearch Path we will be trying to match. - * @param pathTail - * @return True if pathTail is tail on the path of pathToSearch - */ - public static boolean isMatchingTail(final Path pathToSearch, final Path pathTail) { - if (pathToSearch.depth() != pathTail.depth()) return false; - Path tailPath = pathTail; - String tailName; - Path toSearch = pathToSearch; - String toSearchName; - boolean result = false; - do { - tailName = tailPath.getName(); - if (tailName == null || tailName.length() <= 0) { - result = true; - break; - } - toSearchName = toSearch.getName(); - if (toSearchName == null || toSearchName.length() <= 0) break; - // Move up a parent on each path for next go around. Path doesn't let us go off the end. 
- tailPath = tailPath.getParent(); - toSearch = toSearch.getParent(); - } while(tailName.equals(toSearchName)); - return result; - } - public static FSUtils getInstance(FileSystem fs, Configuration conf) { String scheme = fs.getUri().getScheme(); if (scheme == null) { @@ -172,21 +142,6 @@ public abstract class FSUtils extends CommonFSUtils { return fsUtils; } - /** - * Delete the region directory if exists. - * @param conf - * @param hri - * @return True if deleted the region directory. - * @throws IOException - */ - public static boolean deleteRegionDir(final Configuration conf, final HRegionInfo hri) - throws IOException { - Path rootDir = getRootDir(conf); - FileSystem fs = rootDir.getFileSystem(conf); - return deleteDirectory(fs, - new Path(getTableDir(rootDir, hri.getTable()), hri.getEncodedName())); - } - /** * Create the specified file on the filesystem. By default, this will: *
    @@ -1664,42 +1619,6 @@ public abstract class FSUtils extends CommonFSUtils { LOG.info(overheadMsg); } - /** - * Do our short circuit read setup. - * Checks buffer size to use and whether to do checksumming in hbase or hdfs. - * @param conf - */ - public static void setupShortCircuitRead(final Configuration conf) { - // Check that the user has not set the "dfs.client.read.shortcircuit.skip.checksum" property. - boolean shortCircuitSkipChecksum = - conf.getBoolean("dfs.client.read.shortcircuit.skip.checksum", false); - boolean useHBaseChecksum = conf.getBoolean(HConstants.HBASE_CHECKSUM_VERIFICATION, true); - if (shortCircuitSkipChecksum) { - LOG.warn("Configuration \"dfs.client.read.shortcircuit.skip.checksum\" should not " + - "be set to true." + (useHBaseChecksum ? " HBase checksum doesn't require " + - "it, see https://issues.apache.org/jira/browse/HBASE-6868." : "")); - assert !shortCircuitSkipChecksum; //this will fail if assertions are on - } - checkShortCircuitReadBufferSize(conf); - } - - /** - * Check if short circuit read buffer size is set and if not, set it to hbase value. - * @param conf - */ - public static void checkShortCircuitReadBufferSize(final Configuration conf) { - final int defaultSize = HConstants.DEFAULT_BLOCKSIZE * 2; - final int notSet = -1; - // DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_KEY is only defined in h2 - final String dfsKey = "dfs.client.read.shortcircuit.buffer.size"; - int size = conf.getInt(dfsKey, notSet); - // If a size is set, return -- we will use it. - if (size != notSet) return; - // But short circuit buffer size is normally not set. Put in place the hbase wanted size. - int hbaseSize = conf.getInt("hbase." + dfsKey, defaultSize); - conf.setIfUnset(dfsKey, Integer.toString(hbaseSize)); - } - /** * @param c * @return The DFSClient DFSHedgedReadMetrics instance or null if can't be found or not on hdfs. 
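
The two helpers removed above are not simply dropped: as the HRegionServer change earlier in this
patch shows, the call site now goes through `CommonFSUtils.setupShortCircuitRead(conf)`. For
readers who want the behavior in one place, here is a minimal sketch of the equivalent logic,
assuming CommonFSUtils keeps the same property names and defaults as the deleted FSUtils code (the
class below is illustrative only, not part of this patch):

[source,java]
----
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Sketch for illustration; mirrors the short-circuit read helpers removed from FSUtils above.
public final class ShortCircuitReadSetupSketch {
  private static final Logger LOG = LoggerFactory.getLogger(ShortCircuitReadSetupSketch.class);

  public static void setupShortCircuitRead(final Configuration conf) {
    // HBase-level checksums make HDFS-level SCR checksum skipping unnecessary (HBASE-6868),
    // so warn if the user has set the property explicitly.
    boolean skipChecksum = conf.getBoolean("dfs.client.read.shortcircuit.skip.checksum", false);
    boolean useHBaseChecksum = conf.getBoolean(HConstants.HBASE_CHECKSUM_VERIFICATION, true);
    if (skipChecksum) {
      LOG.warn("\"dfs.client.read.shortcircuit.skip.checksum\" should not be set to true."
          + (useHBaseChecksum ? " HBase checksum doesn't require it, see HBASE-6868." : ""));
    }
    // If no SCR buffer size was configured, lower it from the 1M HDFS default to the HBase
    // default of 2 * HConstants.DEFAULT_BLOCKSIZE (128k); see HBASE-8143.
    final String dfsKey = "dfs.client.read.shortcircuit.buffer.size";
    if (conf.getInt(dfsKey, -1) == -1) {
      conf.setIfUnset(dfsKey,
          Integer.toString(conf.getInt("hbase." + dfsKey, HConstants.DEFAULT_BLOCKSIZE * 2)));
    }
  }
}
----
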
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestChecksum.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestChecksum.java index dd8ebb3567..73138507ac 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestChecksum.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestChecksum.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.io.hfile; import static org.apache.hadoop.hbase.io.compress.Compression.Algorithm.GZ; import static org.apache.hadoop.hbase.io.compress.Compression.Algorithm.NONE; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.ByteArrayInputStream; @@ -27,10 +28,6 @@ import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; import java.nio.BufferUnderflowException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; @@ -107,9 +104,7 @@ public class TestChecksum { */ @Test public void testAllChecksumTypes() throws IOException { - List cktypes = new ArrayList<>(Arrays.asList(ChecksumType.values())); - for (Iterator itr = cktypes.iterator(); itr.hasNext(); ) { - ChecksumType cktype = itr.next(); + for (ChecksumType cktype : ChecksumType.values()) { Path path = new Path(TEST_UTIL.getDataTestDir(), "checksum" + cktype.getName()); FSDataOutputStream os = fs.create(path); HFileContext meta = new HFileContextBuilder() @@ -125,7 +120,7 @@ public class TestChecksum { os.close(); // Use hbase checksums. - assertEquals(true, hfs.useHBaseChecksum()); + assertTrue(hfs.useHBaseChecksum()); FSDataInputStreamWrapper is = new FSDataInputStreamWrapper(fs, path); meta = new HFileContextBuilder().withHBaseCheckSum(true).build(); @@ -183,7 +178,7 @@ public class TestChecksum { os.close(); // Use hbase checksums. - assertEquals(true, hfs.useHBaseChecksum()); + assertTrue(hfs.useHBaseChecksum()); // Do a read that purposely introduces checksum verification failures. FSDataInputStreamWrapper is = new FSDataInputStreamWrapper(fs, path); @@ -233,7 +228,7 @@ public class TestChecksum { // the configuration. In this case, we should not detect // any retries within hbase. HFileSystem newfs = new HFileSystem(TEST_UTIL.getConfiguration(), false); - assertEquals(false, newfs.useHBaseChecksum()); + assertFalse(newfs.useHBaseChecksum()); is = new FSDataInputStreamWrapper(newfs, path); hbr = new CorruptedFSReaderImpl(is, totalSize, newfs, path, meta); b = hbr.readBlockData(0, -1, pread, false); @@ -306,7 +301,7 @@ public class TestChecksum { ", expectedChunks=" + expectedChunks); // Verify hbase checksums. - assertEquals(true, hfs.useHBaseChecksum()); + assertTrue(hfs.useHBaseChecksum()); // Read data back from file. 
     FSDataInputStream is = fs.open(path);
diff --git a/src/main/asciidoc/_chapters/performance.adoc b/src/main/asciidoc/_chapters/performance.adoc
index 866779ca78..ebeed57f9c 100644
--- a/src/main/asciidoc/_chapters/performance.adoc
+++ b/src/main/asciidoc/_chapters/performance.adoc
@@ -841,17 +841,26 @@ See the link:https://issues.apache.org/jira/browse/HDFS-1599[Umbrella Jira Ticke
 [[perf.hdfs.configs.localread]]
 === Leveraging local data
 
-Since Hadoop 1.0.0 (also 0.22.1, 0.23.1, CDH3u3 and HDP 1.0) via link:https://issues.apache.org/jira/browse/HDFS-2246[HDFS-2246], it is possible for the DFSClient to take a "short circuit" and read directly from the disk instead of going through the DataNode when the data is local.
+It is possible for the DFSClient that HBase uses to take a "short circuit" and read directly from
+the disk instead of going through the DataNode when the data is local.
 What this means for HBase is that the RegionServers can read directly off their machine's disks instead of having to open a socket to talk to the DataNode, the former being generally much faster.
 See JD's link:http://files.meetup.com/1350427/hug_ebay_jdcryans.pdf[Performance Talk].
 Also see link:http://search-hadoop.com/m/zV6dKrLCVh1[HBase, mail # dev - read short circuit] thread for more discussion around short circuit reads.
+The exact numbers presented there may be out of date, and a few of the configuration properties
+have moved around, but the general concepts still apply.
 
 To enable "short circuit" reads, it will depend on your version of Hadoop.
 The original shortcircuit read patch was much improved upon in Hadoop 2 in link:https://issues.apache.org/jira/browse/HDFS-347[HDFS-347].
-See http://blog.cloudera.com/blog/2013/08/how-improved-short-circuit-local-reads-bring-better-performance-and-security-to-hadoop/ for details on the difference between the old and new implementations.
-See link:http://archive.cloudera.com/cdh4/cdh/4/hadoop/hadoop-project-dist/hadoop-hdfs/ShortCircuitLocalReads.html[Hadoop shortcircuit reads configuration page] for how to enable the latter, better version of shortcircuit.
-For example, here is a minimal config.
-enabling short-circuit reads added to _hbase-site.xml_:
+See the HDFS documentation at
+link:https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/ShortCircuitLocalReads.html[Short-Circuit Local Reads]
+for full detail on how to configure and enable the SCR feature.
+
+Both the RegionServer and the DataNode will need to have the hadoop native `.so` library loaded
+for SCR functionality to be available. Both ends of the connection, that is the DFS DataNode
+and the HBase RegionServer, must also be configured with matching properties (most importantly
+the shared domain socket path) for the short circuit to be established.
+
+For example, here is a minimal configuration snippet:
 
 [source,xml]
 ----
@@ -874,23 +883,47 @@ enabling short-circuit reads added to _hbase-site.xml_:
 
 ----
 
-Be careful about permissions for the directory that hosts the shared domain socket; dfsclient will complain if open to other than the hbase user.
-
-If you are running on an old Hadoop, one that is without link:https://issues.apache.org/jira/browse/HDFS-347[HDFS-347] but that has link:https://issues.apache.org/jira/browse/HDFS-2246[HDFS-2246], you must set two configurations.
-First, the hdfs-site.xml needs to be amended.
-Set the property `dfs.block.local-path-access.user` to be the _only_ user that can use the shortcut.
-This has to be the user that started HBase.
-Then in hbase-site.xml, set `dfs.client.read.shortcircuit` to be `true`
-
-Services -- at least the HBase RegionServers -- will need to be restarted in order to pick up the new configurations.
+Other properties should be set for you by HBase internals, but in rare cases you may continue to
+encounter issues. In those scenarios, you may need to perform advanced tuning.
 
 .dfs.client.read.shortcircuit.buffer.size
 [NOTE]
 ====
-The default for this value is too high when running on a highly trafficked HBase.
-In HBase, if this value has not been set, we set it down from the default of 1M to 128k (Since HBase 0.98.0 and 0.96.1). See link:https://issues.apache.org/jira/browse/HBASE-8143[HBASE-8143 HBase on Hadoop 2 with local short circuit reads (ssr) causes OOM]). The Hadoop DFSClient in HBase will allocate a direct byte buffer of this size for _each_ block it has open; given HBase keeps its HDFS files open all the time, this can add up quickly.
+The HDFS default for this value is too high when running HBase under heavy load.
+
+In HBase, if this value has not been configured, we lower it from the default of 1M to 128k.
+See link:https://issues.apache.org/jira/browse/HBASE-8143[HBASE-8143] for discussion of "HBase on
+Hadoop 2 with local short circuit reads (ssr) causes OOM".
+
+The Hadoop DFSClient in HBase will allocate a direct byte buffer of this size for _each_ block it
+has open; given HBase keeps its HDFS files open all the time, this can add up quickly.
 ====
 
+Other configurations you may consider:
+
+* In `hbase-site.xml`, increase `hbase.hstore.min.locality.to.skip.major.compact` from the
+default value of `0.0` (up to a max of `1.0`) to encourage more data locality during compactions.
+A value of `0.7` has been experimentally shown to perform well, but likely needs additional
+refinement based on your specific workload.
+* Make sure DataNodes have enough handlers for block transfers. In `hdfs-site.xml`, consider the
+following parameters:
+- `dfs.datanode.max.xcievers >= 8192`
+- `dfs.datanode.handler.count =` number of spindles
+* At least one user reported improvements after tuning
+`dfs.client.read.shortcircuit.streams.cache.size` and `dfs.client.socketcache.capacity`.
+Documentation is sparse on these options; you may end up reading source code to adjust them.
+
+Be careful about permissions for the directory that hosts the shared domain socket; the DFSClient will complain if it is accessible to users other than the one running HBase.
+
+HBase RegionServers will need to be restarted in order to pick up the new configurations.
+Check the RegionServer logs after restart; you should only see complaints if something is misconfigured.
+Otherwise, short-circuit reads operate quietly in the background. They expose no metrics, so there is
+no direct way to see how effective they are, but read latencies should show a marked improvement,
+especially with good data locality, many random reads, and a dataset larger than the available cache.
+
+For more on short-circuit reads, see Colin's old blog on rollout,
+link:http://blog.cloudera.com/blog/2013/08/how-improved-short-circuit-local-reads-bring-better-performance-and-security-to-hadoop/[How Improved Short-Circuit Local Reads Bring Better Performance and Security to Hadoop].
+
 [[perf.hdfs.comp]]
 === Performance Comparisons of HBase vs.
HDFS diff --git a/src/main/asciidoc/_chapters/schema_design.adoc b/src/main/asciidoc/_chapters/schema_design.adoc index fdbd18468c..4afac108bf 100644 --- a/src/main/asciidoc/_chapters/schema_design.adoc +++ b/src/main/asciidoc/_chapters/schema_design.adoc @@ -1148,38 +1148,8 @@ Detect regionserver failure as fast as reasonable. Set the following parameters: [[shortcircuit.reads]] === Optimize on the Server Side for Low Latency -Skip the network for local blocks when the RegionServer goes to read from HDFS by exploiting HDFS's -link:https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/ShortCircuitLocalReads.html[Short-Circuit Local Reads] facility. -Note how setup must be done both at the datanode and on the dfsclient ends of the conneciton -- i.e. at the RegionServer -and how both ends need to have loaded the hadoop native `.so` library. -After configuring your hadoop setting _dfs.client.read.shortcircuit_ to _true_ and configuring -the _dfs.domain.socket.path_ path for the datanode and dfsclient to share and restarting, next configure -the regionserver/dfsclient side. - -* In `hbase-site.xml`, set the following parameters: -- `dfs.client.read.shortcircuit = true` -- `dfs.client.read.shortcircuit.skip.checksum = true` so we don't double checksum (HBase does its own checksumming to save on i/os. See <> for more on this. -- `dfs.domain.socket.path` to match what was set for the datanodes. -- `dfs.client.read.shortcircuit.buffer.size = 131072` Important to avoid OOME -- hbase has a default it uses if unset, see `hbase.dfs.client.read.shortcircuit.buffer.size`; its default is 131072. -* Ensure data locality. In `hbase-site.xml`, set `hbase.hstore.min.locality.to.skip.major.compact = 0.7` (Meaning that 0.7 \<= n \<= 1) -* Make sure DataNodes have enough handlers for block transfers. In `hdfs-site.xml`, set the following parameters: -- `dfs.datanode.max.xcievers >= 8192` -- `dfs.datanode.handler.count =` number of spindles - -Check the RegionServer logs after restart. You should only see complaint if misconfiguration. -Otherwise, shortcircuit read operates quietly in background. It does not provide metrics so -no optics on how effective it is but read latencies should show a marked improvement, especially if -good data locality, lots of random reads, and dataset is larger than available cache. - -Other advanced configurations that you might play with, especially if shortcircuit functionality -is complaining in the logs, include `dfs.client.read.shortcircuit.streams.cache.size` and -`dfs.client.socketcache.capacity`. Documentation is sparse on these options. You'll have to -read source code. - -For more on short-circuit reads, see Colin's old blog on rollout, -link:http://blog.cloudera.com/blog/2013/08/how-improved-short-circuit-local-reads-bring-better-performance-and-security-to-hadoop/[How Improved Short-Circuit Local Reads Bring Better Performance and Security to Hadoop]. -The link:https://issues.apache.org/jira/browse/HDFS-347[HDFS-347] issue also makes for an -interesting read showing the HDFS community at its best (caveat a few comments). +See the performance section for more details about <>. === JVM Tuning -- 2.16.1
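
As a companion to the documentation changes above, here is an illustrative sketch of the
client-side SCR settings they discuss. In practice these normally go into _hbase-site.xml_ on the
RegionServer side rather than into code; the property names are taken from the docs above, and the
domain socket path is only a placeholder that must match the DataNode configuration:

[source,java]
----
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

// Illustrative only: programmatic equivalent of the hbase-site.xml settings discussed in the
// documentation changes above.
public final class ScrConfigExample {
  public static Configuration scrEnabledConf() {
    Configuration conf = HBaseConfiguration.create();
    conf.setBoolean("dfs.client.read.shortcircuit", true);
    // Placeholder path; must match whatever the DataNodes were configured with.
    conf.set("dfs.domain.socket.path", "/path/shared/with/datanode/dn_socket");
    // Optional: HBase lowers this to 128k (131072) on its own if left unset; see HBASE-8143.
    conf.setInt("dfs.client.read.shortcircuit.buffer.size", 131072);
    // Encourage locality-driven major compactions, per the tuning note above.
    conf.setFloat("hbase.hstore.min.locality.to.skip.major.compact", 0.7f);
    return conf;
  }
}
----

Leaving `dfs.client.read.shortcircuit.buffer.size` unset is also fine; as noted in the
documentation above, HBase lowers it from the HDFS default on its own.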