Add a --maxthreads option so the region mover can unload/reload several
regions in parallel, and plumb it through graceful_stop.sh and
rolling-restart.sh.

Review notes for this revision (hunk line counts are unchanged):
  * unloadRegions: worker threads append to movedRegions concurrently, so the
    list is wrapped with Collections.synchronizedList.
  * loadRegions: the admin is created from the shared connection and the
    connection stays open until every move thread has been joined.
  * loadRegions: currentServer is handed to each thread as an argument; the
    loop reassigns the local before a started thread is guaranteed to read it.
  * --maxthreads values below 1 (including non-numeric input, which to_i maps
    to 0) are clamped to 1 so the batching loops always make progress.
  * rolling-restart.sh reports the offending argument via $1; $x is no longer
    set once the for-loop is replaced by the while/shift loop.
  * Whitespace inside context lines was reconstructed; apply with
    --ignore-whitespace if the target files differ in indentation.

diff --git a/bin/graceful_stop.sh b/bin/graceful_stop.sh
index c14bbd7..de070aa 100755
--- a/bin/graceful_stop.sh
+++ b/bin/graceful_stop.sh
@@ -23,13 +23,14 @@
 # Move regions off a server then stop it. Optionally restart and reload.
 # Turn off the balancer before running this script.
 function usage {
-  echo "Usage: graceful_stop.sh [--config ] [-d] [-e] [--restart [--reload]] [--thrift] [--rest] "
-  echo " thrift      If we should stop/start thrift before/after the hbase stop/start"
-  echo " rest        If we should stop/start rest before/after the hbase stop/start"
-  echo " restart     If we should restart after graceful stop"
-  echo " reload      Move offloaded regions back on to the restarted server"
-  echo " d|debug     Print helpful debug information"
-  echo " hostname    Hostname of server we are to stop"
+  echo "Usage: graceful_stop.sh [--config ] [-d] [-e] [--restart [--reload]] [--thrift] [--rest] [--maxthreads xx] "
+  echo " thrift         If we should stop/start thrift before/after the hbase stop/start"
+  echo " rest           If we should stop/start rest before/after the hbase stop/start"
+  echo " restart        If we should restart after graceful stop"
+  echo " reload         Move offloaded regions back on to the restarted server"
+  echo " d|debug        Print helpful debug information"
+  echo " maxthreads xx  Limit the number of threads used by the region mover. Default value is 1."
+  echo " hostname       Hostname of server we are to stop"
   echo " e|failfast  Set -e so exit immediately if any command exits with non-zero status"
   exit 1
 }
@@ -48,6 +49,7 @@ reload=
 debug=
 thrift=
 rest=
+maxthreads=1
 failfast=
 while [ $# -gt 0 ]
 do
@@ -60,6 +62,7 @@ do
     -e) failfast=true; shift;;
     --debug) ;&
     -d) debug="--debug"; shift;;
+    --maxthreads) shift; maxthreads=$1; shift;;
     --) shift; break;;
     -*) usage ;;
     *) break;; # terminate while loop
@@ -90,7 +93,7 @@ HBASE_BALANCER_STATE=`echo 'balance_switch false' | "$bin"/hbase --config ${HBAS
 log "Previous balancer state was $HBASE_BALANCER_STATE"
 
 log "Unloading $hostname region(s)"
-HBASE_NOEXEC=true "$bin"/hbase --config ${HBASE_CONF_DIR} org.jruby.Main "$bin"/region_mover.rb --file=$filename $debug unload $hostname
+HBASE_NOEXEC=true "$bin"/hbase --config ${HBASE_CONF_DIR} org.jruby.Main "$bin"/region_mover.rb --file=$filename $debug --maxthreads=$maxthreads unload $hostname
 log "Unloaded $hostname region(s)"
 
 # Stop the server(s). Have to put hostname into its own little file for hbase-daemons.sh
@@ -121,7 +124,7 @@ if [ "$restart" != "" ]; then
   fi
   if [ "$reload" != "" ]; then
     log "Reloading $hostname region(s)"
-    HBASE_NOEXEC=true "$bin"/hbase --config ${HBASE_CONF_DIR} org.jruby.Main "$bin"/region_mover.rb --file=$filename $debug load $hostname
+    HBASE_NOEXEC=true "$bin"/hbase --config ${HBASE_CONF_DIR} org.jruby.Main "$bin"/region_mover.rb --file=$filename $debug --maxthreads=$maxthreads load $hostname
     log "Reloaded $hostname region(s)"
   fi
 fi
diff --git a/bin/region_mover.rb b/bin/region_mover.rb
index e7a8dd3..c532b08 100644
--- a/bin/region_mover.rb
+++ b/bin/region_mover.rb
@@ -40,6 +40,7 @@ import org.apache.commons.logging.LogFactory
 import org.apache.hadoop.hbase.protobuf.ProtobufUtil
 import org.apache.hadoop.hbase.ServerName
 import org.apache.hadoop.hbase.HRegionInfo
+import org.apache.hadoop.hbase.TableName
 
 # Name of this script
 NAME = "region_mover"
@@ -48,7 +49,7 @@ NAME = "region_mover"
 def getMetaTable(config)
   # Keep meta reference in ruby global
   if not $META
-    $META = HTable.new(config, HConstants::META_TABLE_NAME)
+    $META = HTable.new(config, TableName::META_TABLE_NAME.getName())
   end
   return $META
 end
@@ -299,34 +300,53 @@ def unloadRegions(options, hostname)
   filename = getFilename(options, hostname)
   deleteFile(filename)
   # Get an admin instance
-  admin = HBaseAdmin.new(config)
-  servers = getServers(admin)
-  # Remove the server we are unloading from from list of servers.
-  # Side-effect is the servername that matches this hostname
-  servername = stripServer(servers, hostname)
-
-  # Remove the servers in our exclude list from list of servers.
-  servers = stripExcludes(servers, options[:excludesFile])
-  puts "Valid region move targets: ", servers
-  movedRegions = java.util.ArrayList.new()
-  while true
-    rs = getRegions(config, servername)
-    break if rs.length == 0
-    count = 0
-    $LOG.info("Moving " + rs.length.to_s + " region(s) from " + servername +
-      " during this cycle");
-    for r in rs
-      # Get a random server to move the region to.
-      server = servers[rand(servers.length)]
-      $LOG.info("Moving region " + r.getEncodedName() + " (" + count.to_s +
-        " of " + rs.length.to_s + ") to server=" + server);
-      count = count + 1
-      # Assert we can scan region in its current location
-      isSuccessfulScan(admin, r)
-      # Now move it.
-      move(admin, r, server, servername)
-      movedRegions.add(r)
+  connection = HConnectionManager.createConnection(config)
+  begin
+    admin = HBaseAdmin.new(connection)
+    servers = getServers(admin)
+    # Remove the server we are unloading from from list of servers.
+    # Side-effect is the servername that matches this hostname
+    servername = stripServer(servers, hostname)
+
+    # Remove the servers in our exclude list from list of servers.
+    servers = stripExcludes(servers, options[:excludesFile])
+    puts "Valid region move targets: ", servers
+    movedRegions = java.util.Collections.synchronizedList(java.util.ArrayList.new())
+    while true
+      rs = getRegions(config, servername)
+      break if rs.length == 0
+      count = 0
+      $LOG.info("Moving " + rs.length.to_s + " region(s) from " + servername +
+        " during this cycle");
+      counter = 0
+      while counter < rs.length do
+        server_index = 0
+        threads = []
+        while server_index < servers.length && counter < rs.length && server_index < options[:maxthreads] do
+          count += 1
+          destination_server = server_index
+          # If we have less threads than servers, then choose servers randomly to not always pickup
+          # the same first x servers.
+          if options[:maxthreads] < servers.length
+            destination_server = rand(servers.length)
+          end
+          $LOG.info("Moving region " + rs[counter].getEncodedName() + " (" + count.to_s +
+            " of " + rs.length.to_s + ") to server=" + servers[destination_server]);
+          threads << Thread.new(rs[counter], servers[destination_server]) { |region,server|
+            # Assert we can scan region in its current location
+            isSuccessfulScan(admin, region)
+            # Now move it.
+            move(admin, region, server, servername)
+            movedRegions.add(region)
+          }
+          server_index += 1
+          counter += 1
+        end
+        threads.each { |aThread| aThread.join }
+      end
     end
+  ensure
+    connection.close() if connection != nil
   end
   if movedRegions.size() > 0
     # Write out file of regions moved
@@ -340,45 +360,61 @@ def loadRegions(options, hostname)
   # Get configuration
   config = getConfiguration()
   # Get an admin instance
-  admin = HBaseAdmin.new(config)
-  filename = getFilename(options, hostname)
-  regions = readFile(filename)
-  return if regions.isEmpty()
-  servername = nil
-  # Wait till server is up
-  maxWaitInSeconds = admin.getConfiguration.getInt("hbase.serverstart.wait.max", 180)
-  maxWait = Time.now + maxWaitInSeconds
-  while Time.now < maxWait
-    servers = getServers(admin)
-    begin
-      servername = getServerName(servers, hostname)
-    rescue ArgumentError => e
-      $LOG.info("hostname=" + hostname.to_s + " is not up yet, waiting");
+  connection = HConnectionManager.createConnection(config)
+  begin
+    admin = HBaseAdmin.new(connection)
+    filename = getFilename(options, hostname)
+    regions = readFile(filename)
+    return if regions.isEmpty()
+    servername = nil
+    # Wait till server is up
+    maxWaitInSeconds = admin.getConfiguration.getInt("hbase.serverstart.wait.max", 180)
+    maxWait = Time.now + maxWaitInSeconds
+    while Time.now < maxWait
+      servers = getServers(admin)
+      begin
+        servername = getServerName(servers, hostname)
+      rescue ArgumentError => e
+        $LOG.info("hostname=" + hostname.to_s + " is not up yet, waiting");
+      end
+      break if servername
+      sleep 0.5
     end
-    break if servername
-    sleep 0.5
-  end
-  $LOG.info("Moving " + regions.size().to_s + " regions to " + servername)
-  count = 0
-  for r in regions
-    exists = false
-    begin
-      isSuccessfulScan(admin, r)
-      exists = true
-    rescue org.apache.hadoop.hbase.NotServingRegionException => e
-      $LOG.info("Failed scan of " + e.message)
-    end
-    count = count + 1
-    next unless exists
-    currentServer = getServerNameForRegion(admin, r)
-    if currentServer and currentServer == servername
-      $LOG.info("Region " + r.getRegionNameAsString() + " (" + count.to_s +
-        " of " + regions.length.to_s + ") already on target server=" + servername)
-      next
+    $LOG.info("Moving " + regions.size().to_s + " regions to " + servername)
+    count = 0
+    counter = 0
+    while counter < regions.length do
+      thread_index = 0
+      threads = []
+      while counter < regions.length && thread_index < options[:maxthreads] do
+        r = regions[counter]
+        counter = counter + 1
+        exists = false
+        begin
+          isSuccessfulScan(admin, r)
+          exists = true
+        rescue org.apache.hadoop.hbase.NotServingRegionException => e
+          $LOG.info("Failed scan of " + e.message)
+        end
+        count = count + 1
+        next unless exists
+        currentServer = getServerNameForRegion(admin, r)
+        if currentServer and currentServer == servername
+          $LOG.info("Region " + r.getRegionNameAsString() + " (" + count.to_s +
+            " of " + regions.length.to_s + ") already on target server=" + servername)
+          next
+        end
+        $LOG.info("Moving region " + r.getEncodedName() + " (" + count.to_s +
+          " of " + regions.length.to_s + ") to server=" + servername + " in thread " + thread_index.to_s);
+        threads << Thread.new(r, currentServer) { |region, original|
+          move(admin, region, servername, original)
+        }
+        thread_index += 1
+      end
+      threads.each { |aThread| aThread.join }
     end
-    $LOG.info("Moving region " + r.getEncodedName() + " (" + count.to_s +
-      " of " + regions.length.to_s + ") to server=" + servername);
-    move(admin, r, servername, currentServer)
+  ensure
+    connection.close() if connection != nil
   end
 end
 
@@ -420,6 +456,7 @@ optparse = OptionParser.new do |opts|
   opts.banner = "Usage: #{NAME}.rb [options] load|unload "
   opts.separator 'Load or unload regions by moving one at a time'
   options[:file] = nil
+  options[:maxthreads] = 1
   opts.on('-f', '--filename=FILE', 'File to save regions list into unloading, or read from loading; default /tmp/') do |file|
     options[:file] = file
   end
@@ -434,6 +471,9 @@ optparse = OptionParser.new do |opts|
   opts.on('-x', '--excludefile=FILE', 'File with hosts-per-line to exclude as unload targets; default excludes only target host; useful for rack decommisioning.') do |file|
     options[:excludesFile] = file
   end
+  opts.on('-m', '--maxthreads=XX', 'Define the maximum number of threads to use to unload and reload the regions') do |number|
+    options[:maxthreads] = [number.to_i, 1].max
+  end
 end
 optparse.parse!
 
@@ -448,7 +488,8 @@ if not hostname
   exit 2
 end
 # Create a logger and save it to ruby global
 $LOG = configureLogging(options)
+
 case ARGV[0]
   when 'load'
     loadRegions(options, hostname)
diff --git a/bin/rolling-restart.sh b/bin/rolling-restart.sh
index 23f8d32..01a22b9 100755
--- a/bin/rolling-restart.sh
+++ b/bin/rolling-restart.sh
@@ -34,7 +34,7 @@
 #
 # Modelled after $HADOOP_HOME/bin/slaves.sh.
 
-usage="Usage: $0 [--config ] [--rs-only] [--master-only] [--graceful]"
+usage="Usage: $0 [--config ] [--rs-only] [--master-only] [--graceful] [--maxthreads xx]"
 
 bin=`dirname "$0"`
 bin=`cd "$bin">/dev/null; pwd`
@@ -57,23 +57,33 @@ function usage() {
 RR_RS=1
 RR_MASTER=1
 RR_GRACEFUL=0
+RR_MAXTHREADS=1
 
-for x in "$@" ; do
-  case "$x" in
+while [ $# -gt 0 ]
+do
+  case "$1" in
     --rs-only|-r)
       RR_RS=1
       RR_MASTER=0
       RR_GRACEFUL=0
+      shift
       ;;
     --master-only)
       RR_RS=0
       RR_MASTER=1
       RR_GRACEFUL=0
+      shift
       ;;
     --graceful)
       RR_RS=0
       RR_MASTER=0
       RR_GRACEFUL=1
+      shift
+      ;;
+    --maxthreads)
+      shift
+      RR_MAXTHREADS=$1
+      shift
       ;;
     *)
-      echo Bad argument: $x
+      echo Bad argument: $1
@@ -158,7 +168,7 @@ else
         rs_parts=(${rs//,/ })
         hostname=${rs_parts[0]}
         echo "Gracefully restarting: $hostname"
-        "$bin"/graceful_stop.sh --config "${HBASE_CONF_DIR}" --restart --reload --debug "$hostname"
+        "$bin"/graceful_stop.sh --config "${HBASE_CONF_DIR}" --restart --reload --debug --maxthreads "${RR_MAXTHREADS}" "$hostname"
         sleep 1
     done
 fi