From ec1e9722b96991dbd6905e403acffb96e90b0695 Mon Sep 17 00:00:00 2001 From: Peter Somogyi Date: Tue, 10 Apr 2018 15:16:03 +0200 Subject: [PATCH] HBASE-20376 RowCounter and CellCounter documentations are incorrect --- .../apache/hadoop/hbase/mapreduce/CellCounter.java | 48 ++++++++++++---------- .../apache/hadoop/hbase/mapreduce/RowCounter.java | 6 +-- src/main/asciidoc/_chapters/ops_mgt.adoc | 24 ++++++----- 3 files changed, 43 insertions(+), 35 deletions(-) diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/CellCounter.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/CellCounter.java index aa79aacfae..1bea871ed9 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/CellCounter.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/CellCounter.java @@ -292,33 +292,39 @@ public class CellCounter extends Configured implements Tool { @Override public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("ERROR: Wrong number of parameters: " + args.length); - System.err.println("Usage: CellCounter "); - System.err.println(" [^[regex pattern] or " + - "[Prefix] for row filter]] --starttime=[starttime] --endtime=[endtime]"); - System.err.println(" Note: -D properties will be applied to the conf used. 
"); - System.err.println(" Additionally, all of the SCAN properties from TableInputFormat"); - System.err.println(" can be specified to get fine grained control on what is counted.."); - System.err.println(" -D " + TableInputFormat.SCAN_ROW_START + "="); - System.err.println(" -D " + TableInputFormat.SCAN_ROW_STOP + "="); - System.err.println(" -D " + TableInputFormat.SCAN_COLUMNS + "=\" ...\""); - System.err.println(" -D " + TableInputFormat.SCAN_COLUMN_FAMILY + "=,, ..."); - System.err.println(" -D " + TableInputFormat.SCAN_TIMESTAMP + "="); - System.err.println(" -D " + TableInputFormat.SCAN_TIMERANGE_START + "="); - System.err.println(" -D " + TableInputFormat.SCAN_TIMERANGE_END + "="); - System.err.println(" -D " + TableInputFormat.SCAN_MAXVERSIONS + "="); - System.err.println(" -D " + TableInputFormat.SCAN_CACHEDROWS + "="); - System.err.println(" -D " + TableInputFormat.SCAN_BATCHSIZE + "="); - System.err.println(" parameter can be used to override the default report separator " + - "string : used to separate the rowId/column family name and qualifier name."); - System.err.println(" [^[regex pattern] or [Prefix] parameter can be used to limit the cell counter count " + - "operation to a limited subset of rows from the table based on regex or prefix pattern."); + printUsage(args.length); return -1; } Job job = createSubmittableJob(getConf(), args); return (job.waitForCompletion(true) ? 
0 : 1); } + private void printUsage(int parameterCount) { + System.err.println("ERROR: Wrong number of parameters: " + parameterCount); + System.err.println("Usage: hbase org.apache.hadoop.hbase.mapreduce.CellCounter"); + System.err.println(" [reportSeparator] [^[regex pattern] or " + + "[Prefix]] [--starttime= --endtime=]"); + System.err.println(" Note: -D properties will be applied to the conf used."); + System.err.println(" Additionally, all of the SCAN properties from TableInputFormat can be " + + "specified to get fine grained control on what is counted."); + System.err.println(" -D" + TableInputFormat.SCAN_ROW_START + "="); + System.err.println(" -D" + TableInputFormat.SCAN_ROW_STOP + "="); + System.err.println(" -D" + TableInputFormat.SCAN_COLUMNS + "=\" ...\""); + System.err.println(" -D" + TableInputFormat.SCAN_COLUMN_FAMILY + + "=,, ..."); + System.err.println(" -D" + TableInputFormat.SCAN_TIMESTAMP + "="); + System.err.println(" -D" + TableInputFormat.SCAN_TIMERANGE_START + "="); + System.err.println(" -D" + TableInputFormat.SCAN_TIMERANGE_END + "="); + System.err.println(" -D" + TableInputFormat.SCAN_MAXVERSIONS + "="); + System.err.println(" -D" + TableInputFormat.SCAN_CACHEDROWS + "="); + System.err.println(" -D" + TableInputFormat.SCAN_BATCHSIZE + "="); + System.err.println(" parameter can be used to override the default report " + + "separator string : used to separate the rowId/column family name and qualifier name."); + System.err.println(" [^[regex pattern] or [Prefix] parameter can be used to limit the cell " + + "counter count operation to a limited subset of rows from the table based on regex or " + + "prefix pattern."); + } + /** * Main entry point. * @param args The command line parameters. 
diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java index 9c7b489181..641989157b 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/RowCounter.java @@ -221,9 +221,9 @@ public class RowCounter extends Configured implements Tool { * Note that we don't document --expected-count, because it's intended for test. */ private static void printUsage() { - System.err.println("Usage: RowCounter [options] " + - "[--starttime=[start] --endtime=[end] " + - "[--range=[startKey],[endKey][;[startKey],[endKey]...]] [ ...]"); + System.err.println("Usage: hbase org.apache.hadoop.hbase.mapreduce.RowCounter [options] " + + " [--starttime= --endtime=] " + + "[--range=[startKey],[endKey][;[startKey],[endKey]...]] [ ...]"); System.err.println("For performance consider the following options:\n" + "-Dhbase.client.scanner.caching=100\n" + "-Dmapreduce.map.speculative=false"); diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc index 816773ec0c..6e6bc512a4 100644 --- a/src/main/asciidoc/_chapters/ops_mgt.adoc +++ b/src/main/asciidoc/_chapters/ops_mgt.adoc @@ -709,24 +709,27 @@ WALPlayer, by default, runs as a mapreduce job. To NOT run WALPlayer as a mapreduce job on your cluster, force it to run all in the local process by adding the flags `-Dmapreduce.jobtracker.address=local` on the command line. [[rowcounter]] -=== RowCounter and CellCounter +=== RowCounter -link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html[RowCounter] is a mapreduce job to count all the rows of a table. +link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/RowCounter.html[RowCounter] is a mapreduce job to count all the rows of a table. 
This is a good utility to use as a sanity check to ensure that HBase can read all the blocks of a table if there are any concerns of metadata inconsistency. -It will run the mapreduce all in a single process but it will run faster if you have a MapReduce cluster in place for it to exploit. It is also possible to limit -the time range of data to be scanned by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags. +It will run the mapreduce all in a single process but it will run faster if you have a MapReduce cluster in place for it to exploit. +It is possible to limit the time range of data to be scanned by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags. +The scanned data can be limited based on keys using the `--range=[startKey],[endKey][;[startKey],[endKey]...]` option. ---- -$ bin/hbase org.apache.hadoop.hbase.mapreduce.RowCounter [ ...] +$ bin/hbase org.apache.hadoop.hbase.mapreduce.RowCounter [options] <tablename> [--starttime=<start> --endtime=<end>] [--range=[startKey],[endKey][;[startKey],[endKey]...]] [<column1> <column2>...] ---- RowCounter only counts one version per cell. -Note: caching for the input Scan is configured via `hbase.client.scanner.caching` in the job configuration. +For performance, consider using the `-Dhbase.client.scanner.caching=100` and `-Dmapreduce.map.speculative=false` options. + +=== CellCounter HBase ships another diagnostic mapreduce job called link:https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/CellCounter.html[CellCounter]. Like RowCounter, it gathers more fine-grained statistics about your table. -The statistics gathered by RowCounter are more fine-grained and include: +The statistics gathered by CellCounter are more fine-grained and include: * Total number of rows in the table. * Total number of CFs across all rows. @@ -737,12 +740,12 @@ The statistics gathered by RowCounter are more fine-grained and include: The program allows you to limit the scope of the run. 
Provide a row regex or prefix to limit the rows to analyze. -Specify a time range to scan the table by using the `--starttime=[starttime]` and `--endtime=[endtime]` flags. +Specify a time range to scan the table by using the `--starttime=<starttime>` and `--endtime=<endtime>` flags. Use `hbase.mapreduce.scan.column.family` to specify scanning a single column family. ---- -$ bin/hbase org.apache.hadoop.hbase.mapreduce.CellCounter [regex or prefix] +$ bin/hbase org.apache.hadoop.hbase.mapreduce.CellCounter <tablename> <outputDir> [reportSeparator] [regex or prefix] [--starttime=<starttime> --endtime=<endtime>] ---- Note: just like RowCounter, caching for the input Scan is configured via `hbase.client.scanner.caching` in the job configuration. @@ -750,8 +753,7 @@ Note: just like RowCounter, caching for the input Scan is configured via `hbase. === mlockall It is possible to optionally pin your servers in physical memory making them less likely to be swapped out in oversubscribed environments by having the servers call link:http://linux.die.net/man/2/mlockall[mlockall] on startup. -See link:https://issues.apache.org/jira/browse/HBASE-4391[HBASE-4391 Add ability to - start RS as root and call mlockall] for how to build the optional library and have it run on startup. +See link:https://issues.apache.org/jira/browse/HBASE-4391[HBASE-4391 Add ability to start RS as root and call mlockall] for how to build the optional library and have it run on startup. [[compaction.tool]] === Offline Compaction Tool -- 2.16.2