From 184db4606d19ea7073b07f4819096945846a1077 Mon Sep 17 00:00:00 2001 From: Peter Somogyi Date: Tue, 10 Apr 2018 15:16:03 +0200 Subject: [PATCH] HBASE-20376 RowCounter and CellCounter documentations are incorrect --- bin/hbase | 6 +++ .../hadoop/hbase/mapreduce/CellCounter.java | 47 ++++++++++--------- .../hadoop/hbase/mapreduce/RowCounter.java | 6 +-- src/main/asciidoc/_chapters/ops_mgt.adoc | 31 +++++++----- 4 files changed, 54 insertions(+), 36 deletions(-) diff --git a/bin/hbase b/bin/hbase index 8e37f5f375..f1e2306cfb 100755 --- a/bin/hbase +++ b/bin/hbase @@ -106,6 +106,8 @@ if [ $# = 0 ]; then echo " backup Backup tables for recovery" echo " restore Restore tables from existing backup image" echo " regionsplitter Run RegionSplitter tool" + echo " rowcounter Run RowCounter tool" + echo " cellcounter Run CellCounter tool" echo " CLASSNAME Run the class named CLASSNAME" exit 1 fi @@ -465,6 +467,10 @@ elif [ "$COMMAND" = "version" ] ; then CLASS='org.apache.hadoop.hbase.util.VersionInfo' elif [ "$COMMAND" = "regionsplitter" ] ; then CLASS='org.apache.hadoop.hbase.util.RegionSplitter' +elif [ "$COMMAND" = "rowcounter" ] ; then + CLASS='org.apache.hadoop.hbase.mapreduce.RowCounter' +elif [ "$COMMAND" = "cellcounter" ] ; then + CLASS='org.apache.hadoop.hbase.mapreduce.CellCounter' else CLASS=$COMMAND fi diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/CellCounter.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/CellCounter.java index aa79aacfae..ff0f01ca19 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/CellCounter.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/CellCounter.java @@ -292,33 +292,38 @@ public class CellCounter extends Configured implements Tool { @Override public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("ERROR: Wrong number of parameters: " + args.length); - System.err.println("Usage: CellCounter "); 
- System.err.println(" [^[regex pattern] or " + - "[Prefix] for row filter]] --starttime=[starttime] --endtime=[endtime]"); - System.err.println(" Note: -D properties will be applied to the conf used. "); - System.err.println(" Additionally, all of the SCAN properties from TableInputFormat"); - System.err.println(" can be specified to get fine grained control on what is counted.."); - System.err.println(" -D " + TableInputFormat.SCAN_ROW_START + "="); - System.err.println(" -D " + TableInputFormat.SCAN_ROW_STOP + "="); - System.err.println(" -D " + TableInputFormat.SCAN_COLUMNS + "=\" ...\""); - System.err.println(" -D " + TableInputFormat.SCAN_COLUMN_FAMILY + "=,, ..."); - System.err.println(" -D " + TableInputFormat.SCAN_TIMESTAMP + "="); - System.err.println(" -D " + TableInputFormat.SCAN_TIMERANGE_START + "="); - System.err.println(" -D " + TableInputFormat.SCAN_TIMERANGE_END + "="); - System.err.println(" -D " + TableInputFormat.SCAN_MAXVERSIONS + "="); - System.err.println(" -D " + TableInputFormat.SCAN_CACHEDROWS + "="); - System.err.println(" -D " + TableInputFormat.SCAN_BATCHSIZE + "="); - System.err.println(" parameter can be used to override the default report separator " + - "string : used to separate the rowId/column family name and qualifier name."); - System.err.println(" [^[regex pattern] or [Prefix] parameter can be used to limit the cell counter count " + - "operation to a limited subset of rows from the table based on regex or prefix pattern."); + printUsage(args.length); return -1; } Job job = createSubmittableJob(getConf(), args); return (job.waitForCompletion(true) ? 
0 : 1); } + private void printUsage(int parameterCount) { + System.err.println("ERROR: Wrong number of parameters: " + parameterCount); + System.err.println("Usage: hbase cellcounter [reportSeparator] " + + "[^[regex pattern] or [Prefix]] [--starttime= --endtime=]"); + System.err.println(" Note: -D properties will be applied to the conf used."); + System.err.println(" Additionally, all of the SCAN properties from TableInputFormat can be " + + "specified to get fine grained control on what is counted."); + System.err.println(" -D" + TableInputFormat.SCAN_ROW_START + "="); + System.err.println(" -D" + TableInputFormat.SCAN_ROW_STOP + "="); + System.err.println(" -D" + TableInputFormat.SCAN_COLUMNS + "=\" ...\""); + System.err.println(" -D" + TableInputFormat.SCAN_COLUMN_FAMILY + + "=,, ..."); + System.err.println(" -D" + TableInputFormat.SCAN_TIMESTAMP + "="); + System.err.println(" -D" + TableInputFormat.SCAN_TIMERANGE_START + "="); + System.err.println(" -D" + TableInputFormat.SCAN_TIMERANGE_END + "="); + System.err.println(" -D" + TableInputFormat.SCAN_MAXVERSIONS + "="); + System.err.println(" -D" + TableInputFormat.SCAN_CACHEDROWS + "="); + System.err.println(" -D" + TableInputFormat.SCAN_BATCHSIZE + "="); + System.err.println(" parameter can be used to override the default report " + + "separator string : used to separate the rowId/column family name and qualifier name."); + System.err.println(" [^[regex pattern] or [Prefix] parameter can be used to limit the cell " + + "counter count operation to a limited subset of rows from the table based on regex or " + + "prefix pattern."); + } + /** * Main entry point. * @param args The command line parameters. 
diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java index 9c7b489181..7fa5dec5ef 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java @@ -221,9 +221,9 @@ public class RowCounter extends Configured implements Tool { * Note that we don't document --expected-count, because it's intended for test. */ private static void printUsage() { - System.err.println("Usage: RowCounter [options] " + - "[--starttime=[start] --endtime=[end] " + - "[--range=[startKey],[endKey][;[startKey],[endKey]...]] [ ...]"); + System.err.println("Usage: hbase rowcounter [options] " + + "[--starttime= --endtime=] " + + "[--range=[startKey],[endKey][;[startKey],[endKey]...]] [ ...]"); System.err.println("For performance consider the following options:\n" + "-Dhbase.client.scanner.caching=100\n" + "-Dmapreduce.map.speculative=false"); diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc index 816773ec0c..f3480cf961 100644 --- a/src/main/asciidoc/_chapters/ops_mgt.adoc +++ b/src/main/asciidoc/_chapters/ops_mgt.adoc @@ -68,8 +68,12 @@ Some commands take arguments. Pass no args or -h for usage. pe Run PerformanceEvaluation ltt Run LoadTestTool canary Run the Canary tool - regionsplitter Run the RegionSplitter tool version Print the version + backup Backup tables for recovery + restore Restore tables from existing backup image + regionsplitter Run RegionSplitter tool + rowcounter Run RowCounter tool + cellcounter Run CellCounter tool CLASSNAME Run the class named CLASSNAME ---- @@ -709,24 +713,28 @@ WALPlayer, by default, runs as a mapreduce job. 
To NOT run WALPlayer as a mapreduce job on your cluster, force it to run all in the local process by adding the flags `-Dmapreduce.jobtracker.address=local` on the command line. [[rowcounter]] -=== RowCounter and CellCounter +=== RowCounter -link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html[RowCounter] is a mapreduce job to count all the rows of a table. +link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html[RowCounter] is a mapreduce job to count all the rows of a table. This is a good utility to use as a sanity check to ensure that HBase can read all the blocks of a table if there are any concerns of metadata inconsistency. -It will run the mapreduce all in a single process but it will run faster if you have a MapReduce cluster in place for it to exploit. It is also possible to limit -the time range of data to be scanned by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags. +It will run the mapreduce all in a single process but it will run faster if you have a MapReduce cluster in place for it to exploit. +It is possible to limit the time range of data to be scanned by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags. +The scanned data can be limited based on keys using the `--range=[startKey],[endKey][;[startKey],[endKey]...]` option. ---- -$ bin/hbase org.apache.hadoop.hbase.mapreduce.RowCounter [ ...] +$ bin/hbase rowcounter [options] <tablename> [--starttime=<start> --endtime=<end>] [--range=[startKey],[endKey][;[startKey],[endKey]...]] [<column1> <column2>...] ---- RowCounter only counts one version per cell. -Note: caching for the input Scan is configured via `hbase.client.scanner.caching` in the job configuration. +For performance, consider using the `-Dhbase.client.scanner.caching=100` and `-Dmapreduce.map.speculative=false` options.
+ +[[cellcounter]] +=== CellCounter HBase ships another diagnostic mapreduce job called link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/CellCounter.html[CellCounter]. Like RowCounter, it gathers more fine-grained statistics about your table. -The statistics gathered by RowCounter are more fine-grained and include: +The statistics gathered by CellCounter are more fine-grained and include: * Total number of rows in the table. * Total number of CFs across all rows. @@ -737,12 +745,12 @@ The statistics gathered by RowCounter are more fine-grained and include: The program allows you to limit the scope of the run. Provide a row regex or prefix to limit the rows to analyze. -Specify a time range to scan the table by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags. +Specify a time range to scan the table by using the `--starttime=<starttime>` and `--endtime=<endtime>` flags. Use `hbase.mapreduce.scan.column.family` to specify scanning a single column family. ---- -$ bin/hbase org.apache.hadoop.hbase.mapreduce.CellCounter [regex or prefix] +$ bin/hbase cellcounter <tablename> <outputDir> [reportSeparator] [regex or prefix] [--starttime=<starttime> --endtime=<endtime>] ---- Note: just like RowCounter, caching for the input Scan is configured via `hbase.client.scanner.caching` in the job configuration. @@ -750,8 +758,7 @@ Note: just like RowCounter, caching for the input Scan is configured via `hbase. === mlockall It is possible to optionally pin your servers in physical memory making them less likely to be swapped out in oversubscribed environments by having the servers call link:http://linux.die.net/man/2/mlockall[mlockall] on startup. -See link:https://issues.apache.org/jira/browse/HBASE-4391[HBASE-4391 Add ability to - start RS as root and call mlockall] for how to build the optional library and have it run on startup.
+See link:https://issues.apache.org/jira/browse/HBASE-4391[HBASE-4391 Add ability to start RS as root and call mlockall] for how to build the optional library and have it run on startup. [[compaction.tool]] === Offline Compaction Tool -- 2.17.0