diff --git a/src/assembly/bin.xml b/src/assembly/bin.xml index a1964c7..95227c9 100644 --- a/src/assembly/bin.xml +++ b/src/assembly/bin.xml @@ -40,6 +40,17 @@ **/*.jsp + + target/site + docs + + + src/main/resources/ + conf + + hbase-default.xml + + diff --git a/src/docs/forrest.properties b/src/docs/forrest.properties deleted file mode 100644 index 3a07528..0000000 --- a/src/docs/forrest.properties +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2002-2004 The Apache Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -############## -# Properties used by forrest.build.xml for building the website -# These are the defaults, un-comment them if you need to change them. -############## - -# Prints out a summary of Forrest settings for this project -#forrest.echo=true - -# Project name (used to name .war file) -#project.name=my-project - -# Specifies name of Forrest skin to use -#project.skin=tigris -#project.skin=pelt - -# comma separated list, file:// is supported -#forrest.skins.descriptors=http://forrest.apache.org/skins/skins.xml,file:///c:/myskins/skins.xml - -############## -# behavioural properties -#project.menu-scheme=tab_attributes -#project.menu-scheme=directories - -############## -# layout properties - -# Properties that can be set to override the default locations -# -# Parent properties must be set. This usually means uncommenting -# project.content-dir if any other property using it is uncommented - -#project.status=status.xml -#project.content-dir=src/documentation -#project.raw-content-dir=${project.content-dir}/content -#project.conf-dir=${project.content-dir}/conf -#project.sitemap-dir=${project.content-dir} -#project.xdocs-dir=${project.content-dir}/content/xdocs -#project.resources-dir=${project.content-dir}/resources -#project.stylesheets-dir=${project.resources-dir}/stylesheets -#project.images-dir=${project.resources-dir}/images -#project.schema-dir=${project.resources-dir}/schema -#project.skins-dir=${project.content-dir}/skins -#project.skinconf=${project.content-dir}/skinconf.xml -#project.lib-dir=${project.content-dir}/lib -#project.classes-dir=${project.content-dir}/classes -#project.translations-dir=${project.content-dir}/translations - -############## -# validation properties - -# This set of properties determine if validation is performed -# Values are inherited unless overridden. -# e.g. if forrest.validate=false then all others are false unless set to true. -forrest.validate=false -#forrest.validate.xdocs=${forrest.validate} -#forrest.validate.skinconf=${forrest.validate} -#forrest.validate.sitemap=${forrest.validate} -#forrest.validate.stylesheets=${forrest.validate} -#forrest.validate.skins=${forrest.validate} -#forrest.validate.skins.stylesheets=${forrest.validate.skins} - -# *.failonerror=(true|false) - stop when an XML file is invalid -#forrest.validate.failonerror=true - -# *.excludes=(pattern) - comma-separated list of path patterns to not validate -# e.g. 
-#forrest.validate.xdocs.excludes=samples/subdir/**, samples/faq.xml -#forrest.validate.xdocs.excludes= - - -############## -# General Forrest properties - -# The URL to start crawling from -#project.start-uri=linkmap.html -# Set logging level for messages printed to the console -# (DEBUG, INFO, WARN, ERROR, FATAL_ERROR) -#project.debuglevel=ERROR -# Max memory to allocate to Java -#forrest.maxmemory=64m -# Any other arguments to pass to the JVM. For example, to run on an X-less -# server, set to -Djava.awt.headless=true -#forrest.jvmargs= -# The bugtracking URL - the issue number will be appended -#project.bugtracking-url=http://issues.apache.org/bugzilla/show_bug.cgi?id= -#project.bugtracking-url=http://issues.apache.org/jira/browse/ -# The issues list as rss -#project.issues-rss-url= -#I18n Property only works for the "forrest run" target. -#project.i18n=true diff --git a/src/docs/src/documentation/README.txt b/src/docs/src/documentation/README.txt deleted file mode 100644 index 9bc261b..0000000 --- a/src/docs/src/documentation/README.txt +++ /dev/null @@ -1,7 +0,0 @@ -This is the base documentation directory. - -skinconf.xml # This file customizes Forrest for your project. In it, you - # tell forrest the project name, logo, copyright info, etc - -sitemap.xmap # Optional. This sitemap is consulted before all core sitemaps. - # See http://forrest.apache.org/docs/project-sitemap.html diff --git a/src/docs/src/documentation/classes/CatalogManager.properties b/src/docs/src/documentation/classes/CatalogManager.properties deleted file mode 100644 index ac060b9..0000000 --- a/src/docs/src/documentation/classes/CatalogManager.properties +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2002-2004 The Apache Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#======================================================================= -# CatalogManager.properties -# -# This is the default properties file for Apache Forrest. -# This facilitates local configuration of application-specific catalogs. -# -# See the Apache Forrest documentation: -# http://forrest.apache.org/docs/your-project.html -# http://forrest.apache.org/docs/validation.html - -# verbosity ... level of messages for status/debug -# See forrest/src/core/context/WEB-INF/cocoon.xconf - -# catalogs ... list of additional catalogs to load -# (Note that Apache Forrest will automatically load its own default catalog -# from src/core/context/resources/schema/catalog.xcat) -# use full pathnames -# pathname separator is always semi-colon (;) regardless of operating system -# directory separator is always slash (/) regardless of operating system -# -#catalogs=/home/me/forrest/my-site/src/documentation/resources/schema/catalog.xcat -catalogs= - diff --git a/src/docs/src/documentation/content/xdocs/acid-semantics.xml b/src/docs/src/documentation/content/xdocs/acid-semantics.xml deleted file mode 100644 index 83bf3df..0000000 --- a/src/docs/src/documentation/content/xdocs/acid-semantics.xml +++ /dev/null @@ -1,227 +0,0 @@ - - - - - - - - -
- - HBase ACID Properties - -
- - -
- About this Document -

HBase is not an ACID compliant database. However, it does guarantee certain specific - properties.

-

This specification enumerates the ACID properties of HBase.

-
-
- Definitions -

For the sake of common vocabulary, we define the following terms:

-
-
Atomicity
-
an operation is atomic if it either completes entirely or not at all
- -
Consistency
-
- all actions cause the table to transition from one valid state directly to another - (eg a row will not disappear during an update, etc) -
- -
Isolation
-
- an operation is isolated if it appears to complete independently of any other concurrent transaction -
- -
Durability
-
any update that reports "successful" to the client will not be lost
- -
Visibility
-
an update is considered visible if any subsequent read will see the update as having been committed
-
-

- The terms must and may are used as specified by RFC 2119. - In short, the word "must" implies that, if some case exists where the statement - is not true, it is a bug. The word "may" implies that, even if the guarantee - is provided in a current release, users should not rely on it. -

-
-
- APIs to consider -
    -
  • Read APIs -
      -
    • get
    • -
    • scan
    • -
    -
  • -
  • Write APIs
  • -
      -
    • put
    • -
    • batch put
    • -
    • delete
    • -
    -
  • Combination (read-modify-write) APIs
  • -
      -
    • incrementColumnValue
    • -
    • checkAndPut
    • -
    -
-
- -
- Guarantees Provided - -
- Atomicity - -
    -
  1. All mutations are atomic within a row. Any put will either wholly succeed or wholly fail.
  2. -
      -
    1. An operation that returns a "success" code has completely succeeded.
    2. -
    3. An operation that returns a "failure" code has completely failed.
    4. -
    5. An operation that times out may have succeeded and may have failed. However, - it will not have partially succeeded or failed.
    6. -
    -
  3. This is true even if the mutation crosses multiple column families within a row.
  4. -
  5. APIs that mutate several rows will _not_ be atomic across the multiple rows. - For example, a multiput that operates on rows 'a','b', and 'c' may return having - mutated some but not all of the rows. In such cases, these APIs will return a list - of success codes, each of which may be succeeded, failed, or timed out as described above.
  6. -
  7. The checkAndPut API happens atomically like the typical compareAndSet (CAS) operation - found in many hardware architectures (a minimal Java sketch appears after this list).
  8. -
  9. The order of mutations is seen to happen in a well-defined order for each row, with no - interleaving. For example, if one writer issues the mutation "a=1,b=1,c=1" and - another writer issues the mutation "a=2,b=2,c=2", the row must either - be "a=1,b=1,c=1" or "a=2,b=2,c=2" and must not be something - like "a=1,b=2,c=1".
  10. -
      -
    1. Please note that this is not true _across rows_ for multirow batch mutations.
    2. -
    -
-
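The following is a minimal, illustrative Java sketch of the CAS-style guarantee described above; it is not taken from the HBase source, and the table, family and qualifier names ("mytable", "cf", "state") are assumptions for the example.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class CheckAndPutExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    HTable table = new HTable(conf, "mytable");   // illustrative table name
    Put put = new Put(Bytes.toBytes("row1"));
    put.add(Bytes.toBytes("cf"), Bytes.toBytes("state"), Bytes.toBytes("new"));
    // The Put is applied only if cf:state currently equals "old"; the compare
    // and the write happen as a single atomic action on the region server.
    boolean applied = table.checkAndPut(Bytes.toBytes("row1"),
        Bytes.toBytes("cf"), Bytes.toBytes("state"), Bytes.toBytes("old"), put);
    System.out.println(applied ? "row updated" : "value changed concurrently; not applied");
    table.close();
  }
}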
-
- Consistency and Isolation -
    -
  1. All rows returned via any access API will consist of a complete row that existed at - some point in the table's history.
  2. -
  3. This is true across column families - i.e a get of a full row that occurs concurrent - with some mutations 1,2,3,4,5 will return a complete row that existed at some point in time - between mutation i and i+1 for some i between 1 and 5.
  4. -
  5. The state of a row will only move forward through the history of edits to it.
  6. -
- -
Consistency of Scans -

- A scan is not a consistent view of a table. Scans do - not exhibit snapshot isolation. -

-

- Rather, scans have the following properties: -

- -
    -
  1. - Any row returned by the scan will be a consistent view (i.e. that version - of the complete row existed at some point in time) -
  2. -
  3. - A scan will always reflect a view of the data at least as new as - the beginning of the scan. This satisfies the visibility guarantees - enumerated below.
  4. -
      -
    1. For example, if client A writes data X and then communicates via a side - channel to client B, any scans started by client B will contain data at least - as new as X.
    2. -
    3. A scan _must_ reflect all mutations committed prior to the construction - of the scanner, and _may_ reflect some mutations committed subsequent to the - construction of the scanner.
    4. -
    5. Scans must include all data written prior to the scan (except in - the case where data is subsequently mutated, in which case it _may_ reflect - the mutation)
    6. -
    -
-

- Those familiar with relational databases will recognize this isolation level as "read committed". -

-

- Please note that the guarantees listed above regarding scanner consistency - are referring to "transaction commit time", not the "timestamp" - field of each cell. That is to say, a scanner started at time t may see edits - with a timestamp value greater than t, if those edits were committed with a - "forward dated" timestamp before the scanner was constructed. -

-
-
-
- Visibility -
    -
  1. When a client receives a "success" response for any mutation, that - mutation is immediately visible to both that client and any client with whom it - later communicates through side channels.
  2. -
  3. A row must never exhibit so-called "time-travel" properties. That - is to say, if a series of mutations moves a row sequentially through a series of - states, any sequence of concurrent reads will return a subsequence of those states.
  4. -
      -
    1. For example, if a row's cells are mutated using the "incrementColumnValue" - API, a client must never see the value of any cell decrease.
    2. -
    3. This is true regardless of which read API is used to read back the mutation.
    4. -
    -
  5. Any version of a cell that has been returned to a read operation is guaranteed to - be durably stored.
  6. -
- -
-
- Durability -
    -
  1. All visible data is also durable data. That is to say, a read will never return - data that has not been made durable on disk[1]
  2. -
  3. Any operation that returns a "success" code (eg does not throw an exception) - will be made durable.
  4. -
  5. Any operation that returns a "failure" code will not be made durable - (subject to the Atomicity guarantees above)
  6. -
  7. All reasonable failure scenarios will not affect any of the guarantees of this document.
  8. - -
-
-
- Tunability -

All of the above guarantees must be possible within HBase. For users who would like to trade - off some guarantees for performance, HBase may offer several tuning options. For example:

-
    -
  • Visibility may be tuned on a per-read basis to allow stale reads or time travel.
  • -
  • Durability may be tuned to only flush data to disk on a periodic basis
  • -
-
-
-
- Footnotes - -

[1] In the context of HBase, "durably on disk" implies an hflush() call on the transaction - log. This does not actually imply an fsync() to magnetic media, but rather just that the data has been - written to the OS cache on all replicas of the log. In the case of a full datacenter power loss, it is - possible that the edits are not truly durable.

-
- - -
diff --git a/src/docs/src/documentation/content/xdocs/bulk-loads.xml b/src/docs/src/documentation/content/xdocs/bulk-loads.xml deleted file mode 100644 index fc61ebe..0000000 --- a/src/docs/src/documentation/content/xdocs/bulk-loads.xml +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - - -
- - Bulk Loads in HBase - -
- -
- Overview -

- HBase includes several methods of loading data into tables. - The most straightforward method is to either use the TableOutputFormat - class from a MapReduce job, or use the normal client APIs; however, - these are not always the most efficient methods. -

-
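For contrast with the bulk load path, a minimal sketch of the "normal client API" route mentioned above might look as follows; the table, family and qualifier names are illustrative assumptions, and the table is assumed to exist already.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class SimplePutExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    HTable table = new HTable(conf, "mytable");   // assumes the table already exists
    Put put = new Put(Bytes.toBytes("row1"));
    put.add(Bytes.toBytes("cf"), Bytes.toBytes("qual"), Bytes.toBytes("value1"));
    table.put(put);   // each put travels through the regionserver write path
    table.close();
  }
}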

- This document describes HBase's bulk load functionality. The bulk load - feature uses a MapReduce job to output table data in HBase's internal - data format, and then directly loads the data files into a running - cluster. -

-
-
- Bulk Load Architecture -

- The HBase bulk load process consists of two main steps. -

- - Preparing data via a MapReduce job -

- The first step of a bulk load is to generate HBase data files from - a MapReduce job using HFileOutputFormat. This output format writes - out data in HBase's internal storage format so that they can be - later loaded very efficiently into the cluster. -

-

- In order to function efficiently, HFileOutputFormat must be configured - such that each output HFile fits within a single region. In order to - do this, jobs use Hadoop's TotalOrderPartitioner class to partition the - map output into disjoint ranges of the key space, corresponding to the - key ranges of the regions in the table. -

-

- HFileOutputFormat includes a convenience function, configureIncrementalLoad(), - which automatically sets up a TotalOrderPartitioner based on the current - region boundaries of a table. -

-
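A hedged sketch of a job driver that uses configureIncrementalLoad() is shown below; the mapper, input/output paths and table name are illustrative assumptions rather than code shipped with HBase.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class BulkLoadPrepare {

  /** Toy mapper: turns "rowkey<TAB>value" lines into Puts against family "cf". */
  static class LineToPutMapper
      extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
    @Override
    protected void map(LongWritable offset, Text line, Context context)
        throws IOException, InterruptedException {
      String[] fields = line.toString().split("\t", 2);
      if (fields.length < 2) return;               // skip malformed lines
      byte[] row = Bytes.toBytes(fields[0]);
      Put put = new Put(row);
      put.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes(fields[1]));
      context.write(new ImmutableBytesWritable(row), put);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    Job job = new Job(conf, "bulk-load-prepare");
    job.setJarByClass(BulkLoadPrepare.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(LineToPutMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(Put.class);
    FileInputFormat.addInputPath(job, new Path("/user/todd/input"));      // illustrative paths
    FileOutputFormat.setOutputPath(job, new Path("/user/todd/myoutput"));
    // Wires in HFileOutputFormat, a TotalOrderPartitioner matching the table's
    // current region boundaries, and the appropriate sort reducer.
    HFileOutputFormat.configureIncrementalLoad(job, new HTable(conf, "mytable"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}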
- - Completing the data load -

- After the data has been prepared using HFileOutputFormat, it - is loaded into the cluster using a command line tool. This command line tool - iterates through the prepared data files, and for each one determines the - region the file belongs to. It then contacts the appropriate Region Server - which adopts the HFile, moving it into its storage directory and making - the data available to clients. -

-

- If the region boundaries have changed during the course of bulk load - preparation, or between the preparation and completion steps, the bulk - load commandline utility will automatically split the data files into - pieces corresponding to the new boundaries. This process is not - optimally efficient, so users should take care to minimize the delay between - preparing a bulk load and importing it into the cluster, especially - if other clients are simultaneously loading data through other means. -

-
-
-
- Preparing a bulk load using the <code>importtsv</code> tool -

- HBase ships with a command line tool called importtsv. This tool - is available by running hadoop jar /path/to/hbase-VERSION.jar importtsv. - Running this tool with no arguments prints brief usage information: -

-
-Usage: importtsv -Dimporttsv.columns=a,b,c <tablename> <inputdir>
-
-Imports the given input directory of TSV data into the specified table.
-
-The column names of the TSV data must be specified using the -Dimporttsv.columns
-option. This option takes the form of comma-separated column names, where each
-column name is either a simple column family, or a columnfamily:qualifier. The special
-column name HBASE_ROW_KEY is used to designate that this column should be used
-as the row key for each imported record. You must specify exactly one column
-to be the row key.
-
-In order to prepare data for a bulk data load, pass the option:
-  -Dimporttsv.bulk.output=/path/for/output
-
-Other options that may be specified with -D include:
-  -Dimporttsv.skip.bad.lines=false - fail if encountering an invalid line
-
-
-
- Importing the prepared data using the <code>completebulkload</code> tool -

- After a data import has been prepared using the importtsv tool, the - completebulkload tool is used to import the data into the running cluster. -

-

- The completebulkload tool simply takes the same output path where - importtsv put its results, and the table name. For example: -

- $ hadoop jar hbase-VERSION.jar completebulkload /user/todd/myoutput mytable -

- This tool will run quickly, after which point the new data will be visible in - the cluster. -

-
-
- Advanced Usage -

 Although the importtsv tool is useful in many cases, advanced users may - want to generate data programmatically, or import data from other formats. To get - started doing so, dig into ImportTsv.java and check the JavaDoc for - HFileOutputFormat. -

-

 The import step of the bulk load can also be done programmatically. See the - LoadIncrementalHFiles class for more information. -

-
- -
\ No newline at end of file diff --git a/src/docs/src/documentation/content/xdocs/cygwin.xml b/src/docs/src/documentation/content/xdocs/cygwin.xml deleted file mode 100644 index 5630cff..0000000 --- a/src/docs/src/documentation/content/xdocs/cygwin.xml +++ /dev/null @@ -1,254 +0,0 @@ - - - - - - -
- Installing HBase on Windows using Cygwin -
- - -
-Introduction -

HBase is a distributed, column-oriented store, modeled after Google's BigTable. HBase is built on top of Hadoop for its MapReduce and distributed file system implementation. All these projects are open-source and part of the Apache Software Foundation.

- -

 Being distributed, large-scale platforms, the Hadoop and HBase projects mainly focus on *nix environments for production installations. However, being developed in Java, both projects are fully portable across platforms and, hence, also to the Windows operating system. For ease of development, the projects rely on Cygwin to provide a *nix-like environment on Windows for running the shell scripts.

-
-
-Purpose -

 This document explains the intricacies of running HBase on Windows using Cygwin as an all-in-one single-node installation for testing and development. The HBase Overview and QuickStart guides, on the other hand, go a long way in explaining how to set up HBase in more complex deployment scenarios.

-
- -
-Installation -

For running HBase on Windows, 3 technologies are required: Java, Cygwin and SSH. The following paragraphs detail the installation of each of the aforementioned technologies.

-
-Java -

 HBase depends on the Java Platform, Standard Edition, 6 Release. So the target system has to be provided with at least the Java Runtime Environment (JRE); however, if the system will also be used for development, the Java Development Kit (JDK) is preferred. You can download the latest versions for both from Sun's download page. Installation is a simple GUI wizard that guides you through the process.

-
-
-Cygwin -

Cygwin is probably the oddest technology in this solution stack. It provides a dynamic link library that emulates most of a *nix environment on Windows. On top of that a whole bunch of the most common *nix tools are supplied. Combined, the DLL with the tools form a very *nix-alike environment on Windows.

- -

For installation, Cygwin provides the setup.exe utility that tracks the versions of all installed components on the target system and provides the mechanism for installing or updating everything from the mirror sites of Cygwin.

- -

 To support installation, the setup.exe utility uses 2 directories on the target system. The Root directory for Cygwin (defaults to C:\cygwin) which will become / within the eventual Cygwin installation; and the Local Package directory (e.g. C:\cygsetup) that is the cache where setup.exe stores the packages before they are installed. The cache must not be the same folder as the Cygwin root.

- -

 Perform the following steps to install Cygwin, which are elaborately detailed in the 2nd chapter of the Cygwin User's Guide:

- -
    -
  1. Make sure you have Administrator privileges on the target system.
  2. -
  3. Choose and create your Root and Local Package directories. A good suggestion is to use C:\cygwin\root and C:\cygwin\setup folders.
  4. -
  5. Download the setup.exe utility and save it to the Local Package directory.
  6. -
  7. Run the setup.exe utility, -
      -
    1. Choose the Install from Internet option,
    2. -
    3. Choose your Root and Local Package folders
    4. -
    5. and select an appropriate mirror.
    6. -
    7. Don't select any additional packages yet, as we only want to install Cygwin for now.
    8. -
    9. Wait for download and install
    10. -
    11. Finish the installation
    12. -
    -
  8. -
  9. Optionally, you can now also add a shortcut to your Start menu pointing to the setup.exe utility in the Local Package folder.
  10. -
  11. Add CYGWIN_HOME system-wide environment variable that points to your Root directory.
  12. -
  13. Add %CYGWIN_HOME%\bin to the end of your PATH environment variable.
  14. -
  15. Reboot the system after making changes to the environment variables, otherwise the OS will not be able to find the Cygwin utilities.
  16. -
  17. Test your installation by running your freshly created shortcuts or the Cygwin.bat command in the Root folder. You should end up in a terminal window that is running a Bash shell. Test the shell by issuing following commands: -
      -
    1. cd / should take you to the Root directory in Cygwin;
    2. -
    3. the LS commands that should list all files and folders in the current directory.
    4. -
    5. Use the exit command to end the terminal.
    6. -
    -
  18. -
  19. When needed, to uninstall Cygwin you can simply delete the Root and Local Package directory, and the shortcuts that were created during installation.
  20. -
-
-
SSH -

HBase (and Hadoop) rely on SSH for interprocess/-node communication and launching remote commands. SSH will be provisioned on the target system via Cygwin, which supports running Cygwin programs as Windows services!

- -
    -
  1. Rerun the setup.exe utility.
  2. -
  3. Leave all parameters as is, skipping through the wizard using the Next button until the Select Packages panel is shown.
  4. -
  5. Maximize the window and click the View button to toggle to the list view, which is ordered alphabetically on Package, making it easier to find the packages we'll need.
  6. -
  7. Select the following packages by clicking the status word (normally Skip) so it's marked for installation. Use the Next button to download and install the packages. -
      -
    1. OpenSSH
    2. -
    3. tcp_wrappers
    4. -
    5. diffutils
    6. -
    7. zlib
    8. -
    -
  8. -
  9. Wait for the install to complete and finish the installation.
  10. -
-
-
-HBase -

Download the latest release of HBase from the website. As the HBase distributable is just a zipped archive, installation is as simple as unpacking the archive so it ends up in its final installation directory. Notice that HBase has to be installed in Cygwin and a good directory suggestion is to use /usr/local/ (or [Root directory]\usr\local in Windows slang). You should end up with a /usr/local/hbase-<version> installation in Cygwin.

- -This finishes installation. We go on with the configuration. -
-
-
-Configuration -

 There are 3 parts left to configure: Java, SSH and HBase itself. The following paragraphs explain each topic in detail.

-
-Java -

 One important thing to remember in shell scripting in general (i.e. *nix and Windows) is that managing, manipulating and assembling path names that contain spaces can be very hard, due to the need to escape and quote those characters and strings. So we try to stay away from spaces in path names. *nix environments can help us out here very easily by using symbolic links.

- -
    -
  1. Create a link in /usr/local to the Java home directory by using the following command and substituting the name of your chosen Java environment: -
    LN -s /cygdrive/c/Program\ Files/Java/<jre name> /usr/local/<jre name>
    -
  2. -
  3. Test your Java installation by changing directories to your Java folder CD /usr/local/<jre name> and issuing the command ./bin/java -version. This should output your version of the chosen JRE.
  4. -
-
-
-SSH -

Configuring SSH is quite elaborate, but primarily a question of launching it by default as a Windows service.

- -
    -
  1. On Windows Vista and above make sure you run the Cygwin shell with elevated privileges, by right-clicking on the shortcut and using Run as Administrator.
  2. -
  3. First of all, we have to make sure the rights on some crucial files are correct. Use the commands underneath. You can verify all rights by using the LS -L command on the different files. Also, notice the auto-completion feature in the shell using <TAB> is extremely handy in these situations. -
      -
    1. chmod +r /etc/passwd to make the passwords file readable for all
    2. -
    3. chmod u+w /etc/passwd to make the passwords file writable for the owner
    4. -
    5. chmod +r /etc/group to make the groups file readable for all
    6. -
    -
      -
    1. chmod u+w /etc/group to make the groups file writable for the owner
    2. -
    -
      -
    1. chmod 755 /var to make the var folder writable to owner and readable and executable to all
    2. -
    -
  4. -
  5. Edit the /etc/hosts.allow file using your favorite editor (why not VI in the shell!) and make sure the following two lines are in there before the PARANOID line: -
      -
    1. ALL : localhost 127.0.0.1/32 : allow
    2. -
    3. ALL : [::1]/128 : allow
    4. -
    -
  6. -
  7. Next we have to configure SSH by using the script ssh-host-config -
      -
    1. If this script asks to overwrite an existing /etc/ssh_config, answer yes.
    2. -
    3. If this script asks to overwrite an existing /etc/sshd_config, answer yes.
    4. -
    5. If this script asks to use privilege separation, answer yes.
    6. -
    7. If this script asks to install sshd as a service, answer yes. Make sure you started your shell as Administrator!
    8. -
    9. If this script asks for the CYGWIN value, just <enter> as the default is ntsec.
    10. -
    11. If this script asks to create the sshd account, answer yes.
    12. -
    13. If this script asks to use a different user name as service account, answer no as the default will suffice.
    14. -
    15. If this script asks to create the cyg_server account, answer yes. Enter a password for the account.
    16. -
    -
  8. -
  9. Start the SSH service using net start sshd or cygrunsrv --start sshd. Notice that cygrunsrv is the utility that makes the process run as a Windows service. Confirm that you see a message stating that the CYGWIN sshd service was started successfully.
  10. -
  11. Harmonize Windows and Cygwin user account by using the commands: -
      -
    1. mkpasswd -cl > /etc/passwd
    2. -
    3. mkgroup --local > /etc/group
    4. -
    -
  12. -
  13. Test the installation of SSH: -
      -
    1. Open a new Cygwin terminal
    2. -
    3. Use the command whoami to verify your userID
    4. -
    5. Issue an ssh localhost to connect to the system itself -
        -
      1. Answer yes when presented with the server's fingerprint
      2. -
      3. Issue your password when prompted
      4. -
      5. test a few commands in the remote session
      6. -
      7. The exit command should take you back to your first shell in Cygwin
      8. -
      -
    6. -
    7. Exit should terminate the Cygwin shell.
    8. -
    -
  14. -
-
-
-HBase -If all previous configurations are working properly, we just need some tinkering at the HBase config files to properly resolve on Windows/Cygwin. All files and paths referenced here start from the HBase [installation directory] as working directory. -
    -
  1. HBase uses the ./conf/hbase-env.sh to configure its dependencies on the runtime environment. Copy and uncomment the following lines just underneath their original, and change them to fit your environment. They should read something like: -
      -
    1. export JAVA_HOME=/usr/local/<jre name>
    2. -
    3. export HBASE_IDENT_STRING=$HOSTNAME as this most likely does not include spaces.
    4. -
    -
  2. -
  3. HBase uses the ./conf/hbase-default.xml file for configuration. Some properties do not resolve to existing directories because the JVM runs on Windows. This is the major issue to keep in mind when working with Cygwin: within the shell all paths are *nix-alike, hence relative to the root /. However, every parameter that is to be consumed within the Windows processes themselves needs to be a Windows setting, hence C:\-alike. Change the following properties in the configuration file, adjusting paths where necessary to conform with your own installation: -
      -
    1. hbase.rootdir must read e.g. file:///C:/cygwin/root/tmp/hbase/data
    2. -
    3. hbase.tmp.dir must read C:/cygwin/root/tmp/hbase/tmp
    4. -
    5. hbase.zookeeper.quorum must read 127.0.0.1 because for some reason localhost doesn't seem to resolve properly on Cygwin.
    6. -
    -
  4. -
  5. Make sure the configured hbase.rootdir and hbase.tmp.dir directories exist and have the proper rights set up e.g. by issuing a chmod 777 on them.
  6. -
-
-
-
-Testing -

-This should conclude the installation and configuration of HBase on Windows using Cygwin. So it's time to test it. -

    -
  1. Start a Cygwin terminal, if you haven't already.
  2. -
  3. Change directory to HBase installation using CD /usr/local/hbase-<version>, preferably using auto-completion.
  4. -
  5. Start HBase using the command ./bin/start-hbase.sh -
      -
    1. When prompted to accept the SSH fingerprint, answer yes.
    2. -
    3. When prompted, provide your password. Maybe multiple times.
    4. -
    5. When the command completes, the HBase server should have started.
    6. -
    7. However, to be absolutely certain, check the logs in the ./logs directory for any exceptions.
    8. -
    -
  6. -
  7. Next we start the HBase shell using the command ./bin/hbase shell
  8. -
  9. We run some simple test commands (a Java equivalent of these shell steps is sketched after this list) -
      -
    1. Create a simple table using command create 'test', 'data'
    2. -
    3. Verify the table exists using the command list
    4. -
    5. Insert data into the table using e.g. -
      put 'test', 'row1', 'data:1', 'value1'
      -put 'test', 'row2', 'data:2', 'value2'
      -put 'test', 'row3', 'data:3', 'value3'
      -
    6. -
    7. List all rows in the table using the command scan 'test', which should list all the rows previously inserted. Notice how 3 new columns were added without changing the schema!
    8. -
    9. Finally we get rid of the table by issuing disable 'test' followed by drop 'test' and verified by list which should give an empty listing.
    10. -
    -
  10. -
  11. Leave the shell by exit
  12. -
  13. To stop the HBase server issue the ./bin/stop-hbase.sh command. And wait for it to complete!!! Killing the process might corrupt your data on disk.
  14. -
  15. In case of problems, -
      -
    1. verify the HBase logs in the ./logs directory.
    2. -
    3. Try to fix the problem
    4. -
    5. Get help on the forums or IRC (#hbase@freenode.net). People are very active and keen to help out!
    6. -
    7. Stop, restart and retest the server.
    8. -
    -
  16. -
-
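If you would rather exercise the fresh installation from Java instead of the shell, the sketch below performs roughly the same create/put/scan/drop sequence; it assumes the HBase jars and the ./conf directory are on the classpath, and the table and column names simply mirror the shell example.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class CygwinSmokeTest {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    HBaseAdmin admin = new HBaseAdmin(conf);

    // create 'test', 'data'
    HTableDescriptor desc = new HTableDescriptor("test");
    desc.addFamily(new HColumnDescriptor("data"));
    admin.createTable(desc);

    // put 'test', 'row1', 'data:1', 'value1'
    HTable table = new HTable(conf, "test");
    Put put = new Put(Bytes.toBytes("row1"));
    put.add(Bytes.toBytes("data"), Bytes.toBytes("1"), Bytes.toBytes("value1"));
    table.put(put);

    // scan 'test'
    ResultScanner scanner = table.getScanner(new Scan());
    for (Result result : scanner) {
      System.out.println(result);
    }
    scanner.close();
    table.close();

    // disable 'test' followed by drop 'test'
    admin.disableTable("test");
    admin.deleteTable("test");
  }
}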

-
- -
-Conclusion -

-Now your HBase server is running, start coding and build that next killer app on this particular, but scalable datastore! -

-
- -
diff --git a/src/docs/src/documentation/content/xdocs/index.xml b/src/docs/src/documentation/content/xdocs/index.xml deleted file mode 100644 index 2b25a89..0000000 --- a/src/docs/src/documentation/content/xdocs/index.xml +++ /dev/null @@ -1,40 +0,0 @@ - - - - - - - -
- HBase Documentation -
- - -

- The following documents provide concepts and procedures that will help you - get started using HBase. If you have more questions, you can ask the - mailing list or browse the archives. -

- - - -
diff --git a/src/docs/src/documentation/content/xdocs/metrics.xml b/src/docs/src/documentation/content/xdocs/metrics.xml deleted file mode 100644 index 83389e1..0000000 --- a/src/docs/src/documentation/content/xdocs/metrics.xml +++ /dev/null @@ -1,152 +0,0 @@ - - - - - - - - -
- - HBase Metrics - -
- - -
- Introduction -

- HBase emits Hadoop metrics. -

-
-
- HOWTO -

 First read up on Hadoop metrics. - If you are using ganglia, the GangliaMetrics - wiki page is a useful read.

-

 To have HBase emit metrics, edit $HBASE_HOME/conf/hadoop-metrics.properties - and enable metric 'contexts' per plugin. As of this writing, hadoop supports - file and ganglia plugins. - Yes, the hbase metrics file is named hadoop-metrics rather than - hbase-metrics because currently at least the hadoop metrics system has the - properties filename hardcoded. Per metrics context, - comment out the NullContext and enable one or more plugins instead. -

-

- If you enable the hbase context, on regionservers you'll see total requests since last - metric emission, count of regions and storefiles as well as a count of memstore size. - On the master, you'll see a count of the cluster's requests. -

-

- Enabling the rpc context is good if you are interested in seeing - metrics on each hbase rpc method invocation (counts and time taken). -

-

- The jvm context is - useful for long-term stats on running hbase jvms -- memory used, thread counts, etc. - As of this writing, if more than one jvm is running emitting metrics, at least - in ganglia, the stats are aggregated rather than reported per instance. -

-
- -
- Using with JMX -

- In addition to the standard output contexts supported by the Hadoop - metrics package, you can also export HBase metrics via Java Management - Extensions (JMX). This will allow viewing HBase stats in JConsole or - any other JMX client. -

-
- Enable HBase stats collection -

- To enable JMX support in HBase, first edit - $HBASE_HOME/conf/hadoop-metrics.properties to support - metrics refreshing. (If you've already configured - hadoop-metrics.properties for another output context, - you can skip this step). -

- -# Configuration of the "hbase" context for null -hbase.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread -hbase.period=60 - -# Configuration of the "jvm" context for null -jvm.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread -jvm.period=60 - -# Configuration of the "rpc" context for null -rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread -rpc.period=60 - -
-
- Setup JMX remote access -

- For remote access, you will need to configure JMX remote passwords - and access profiles. Create the files: -

-
-
$HBASE_HOME/conf/jmxremote.passwd (set permissions - to 600)
-
- -monitorRole monitorpass -controlRole controlpass - -
- -
$HBASE_HOME/conf/jmxremote.access
-
- -monitorRole readonly -controlRole readwrite - -
-
-
-
- Configure JMX in HBase startup -

- Finally, edit the $HBASE_HOME/conf/hbase-env.sh - script to add JMX support: -

-
-
$HBASE_HOME/conf/hbase-env.sh
-
-

Add the lines:

- -HBASE_JMX_OPTS="-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false" -HBASE_JMX_OPTS="$HBASE_JMX_OPTS -Dcom.sun.management.jmxremote.password.file=$HBASE_HOME/conf/jmxremote.passwd" -HBASE_JMX_OPTS="$HBASE_JMX_OPTS -Dcom.sun.management.jmxremote.access.file=$HBASE_HOME/conf/jmxremote.access" - -export HBASE_MASTER_OPTS="$HBASE_JMX_OPTS -Dcom.sun.management.jmxremote.port=10101" -export HBASE_REGIONSERVER_OPTS="$HBASE_JMX_OPTS -Dcom.sun.management.jmxremote.port=10102" - -
-
-

- After restarting the processes you want to monitor, you should now be - able to run JConsole (included with the JDK since JDK 5.0) to view - the statistics via JMX. HBase MBeans are exported under the - hadoop domain in JMX. -

-
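As a quick way to confirm the exported MBeans are reachable once JMX is enabled, a small stand-alone JMX client can connect to one of the ports configured above. The sketch below is illustrative only; the host, port and monitorRole credentials simply mirror the example configuration in this document.

import java.util.HashMap;
import java.util.Map;
import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

public class HBaseJmxProbe {
  public static void main(String[] args) throws Exception {
    // Region server JMX port and monitorRole password as configured above.
    JMXServiceURL url = new JMXServiceURL(
        "service:jmx:rmi:///jndi/rmi://localhost:10102/jmxrmi");
    Map<String, Object> env = new HashMap<String, Object>();
    env.put(JMXConnector.CREDENTIALS, new String[] {"monitorRole", "monitorpass"});
    JMXConnector connector = JMXConnectorFactory.connect(url, env);
    try {
      MBeanServerConnection mbsc = connector.getMBeanServerConnection();
      // List every MBean in the "hadoop" domain, where HBase metrics are exported.
      for (ObjectName name : mbsc.queryNames(new ObjectName("hadoop:*"), null)) {
        System.out.println(name);
      }
    } finally {
      connector.close();
    }
  }
}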
-
- -
diff --git a/src/docs/src/documentation/content/xdocs/site.xml b/src/docs/src/documentation/content/xdocs/site.xml deleted file mode 100644 index 6a42647..0000000 --- a/src/docs/src/documentation/content/xdocs/site.xml +++ /dev/null @@ -1,76 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/docs/src/documentation/content/xdocs/tabs.xml b/src/docs/src/documentation/content/xdocs/tabs.xml deleted file mode 100644 index 5af2a23..0000000 --- a/src/docs/src/documentation/content/xdocs/tabs.xml +++ /dev/null @@ -1,35 +0,0 @@ - - - - - - - - - - - - - diff --git a/src/docs/src/documentation/resources/images/architecture.gif b/src/docs/src/documentation/resources/images/architecture.gif deleted file mode 100644 index 8d84a23..0000000 Binary files a/src/docs/src/documentation/resources/images/architecture.gif and /dev/null differ diff --git a/src/docs/src/documentation/resources/images/favicon.ico b/src/docs/src/documentation/resources/images/favicon.ico deleted file mode 100644 index 161bcf7..0000000 Binary files a/src/docs/src/documentation/resources/images/favicon.ico and /dev/null differ diff --git a/src/docs/src/documentation/resources/images/hadoop-logo.jpg b/src/docs/src/documentation/resources/images/hadoop-logo.jpg deleted file mode 100644 index 809525d..0000000 Binary files a/src/docs/src/documentation/resources/images/hadoop-logo.jpg and /dev/null differ diff --git a/src/docs/src/documentation/resources/images/hbase_logo_med.gif b/src/docs/src/documentation/resources/images/hbase_logo_med.gif deleted file mode 100644 index 36d3e3c..0000000 Binary files a/src/docs/src/documentation/resources/images/hbase_logo_med.gif and /dev/null differ diff --git a/src/docs/src/documentation/resources/images/hbase_small.gif b/src/docs/src/documentation/resources/images/hbase_small.gif deleted file mode 100644 index 3275765..0000000 Binary files a/src/docs/src/documentation/resources/images/hbase_small.gif and /dev/null differ diff --git a/src/docs/src/documentation/skinconf.xml b/src/docs/src/documentation/skinconf.xml deleted file mode 100644 index 04a8ce8..0000000 --- a/src/docs/src/documentation/skinconf.xml +++ /dev/null @@ -1,345 +0,0 @@ - - - - - - - - - - - - true - - false - - true - - true - - - true - - - true - - - true - - - false - - - true - - - HBase - The Hadoop database - http://hadoop.apache.org/hbase/ - images/hbase_small.gif - - - Hadoop - Apache Hadoop - http://hadoop.apache.org/ - images/hadoop-logo.jpg - - - - - - - images/favicon.ico - - - 2009 - The Apache Software Foundation. - http://www.apache.org/licenses/ - - - - - - - - - - - - - - - - - - - p.quote { - margin-left: 2em; - padding: .5em; - background-color: #f0f0f0; - font-family: monospace; - } - - - - - - - - - - - - - - - - - - - - - - - - - 1in - 1in - 1.25in - 1in - - - - false - - - false - - - - - - Built with Apache Forrest - http://forrest.apache.org/ - images/built-with-forrest-button.png - 88 - 31 - - - - - - diff --git a/src/docs/status.xml b/src/docs/status.xml deleted file mode 100644 index 3ac3fda..0000000 --- a/src/docs/status.xml +++ /dev/null @@ -1,74 +0,0 @@ - - - - - - - - - - - - - - - - Initial Import - - - - - - - - - Customize this template project with your project's details. This - TODO list is generated from 'status.xml'. - - - Add lots of content. XML content goes in - src/documentation/content/xdocs, or wherever the - ${project.xdocs-dir} property (set in - forrest.properties) points. 
- - - Mail forrest-dev@xml.apache.org - with feedback. - - - - - - diff --git a/src/site/resources/css/site.css b/src/site/resources/css/site.css new file mode 100644 index 0000000..a88f052 --- /dev/null +++ b/src/site/resources/css/site.css @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +a.externalLink, a.externalLink:link, a.externalLink:visited, a.externalLink:active, a.externalLink:hover { + background: none; + padding-right: 0; +} + +/* +body ul { + list-style-type: square; +} +*/ + +#downloadbox { + float: right; + margin: 0 10px 20px 20px; + padding: 5px; + border: 1px solid #999; + background-color: #eee; +} + +#downloadbox h5 { + color: #000; + margin: 0; + border-bottom: 1px solid #aaaaaa; + font-size: smaller; + padding: 0; +} + +#downloadbox p { + margin-top: 1em; + margin-bottom: 0; +} + +#downloadbox ul { + margin-top: 0; + margin-bottom: 1em; + list-style-type: disc; +} + +#downloadbox li { + font-size: smaller; +} + +/* +h4 { + padding: 0; + border: none; + color: #000; + margin: 0; + font-size: larger; + font-weight: bold; +} +*/ + +#banner { + background: none; +} + +#banner img { + margin: 10px; +} + +.frontpagebox { + float: left; + text-align: center; + width: 15em; + margin-left: 0.5em; + margin-right: 0.5em; + margin-top: 2em; +} + +.headline { + font-size: 120%; + font-weight: bold; + padding-top: 1px; + padding-bottom: 5px; + background-image: url(../images/breadcrumbs.jpg); + background-repeat: repeat-x; +} + +/* +#leftColumn { + display: none !important +} + +#bodyColumn { + margin-left: 1.5em; +} +*/ + + diff --git a/src/site/resources/images/architecture.gif b/src/site/resources/images/architecture.gif new file mode 100644 index 0000000..8d84a23 Binary files /dev/null and b/src/site/resources/images/architecture.gif differ diff --git a/src/site/resources/images/favicon.ico b/src/site/resources/images/favicon.ico new file mode 100644 index 0000000..161bcf7 Binary files /dev/null and b/src/site/resources/images/favicon.ico differ diff --git a/src/site/resources/images/hadoop-logo.jpg b/src/site/resources/images/hadoop-logo.jpg new file mode 100644 index 0000000..809525d Binary files /dev/null and b/src/site/resources/images/hadoop-logo.jpg differ diff --git a/src/site/resources/images/hbase_logo_med.gif b/src/site/resources/images/hbase_logo_med.gif new file mode 100644 index 0000000..36d3e3c Binary files /dev/null and b/src/site/resources/images/hbase_logo_med.gif differ diff --git a/src/site/resources/images/hbase_small.gif b/src/site/resources/images/hbase_small.gif new file mode 100644 index 0000000..3275765 Binary files /dev/null and b/src/site/resources/images/hbase_small.gif differ diff --git a/src/site/site.xml b/src/site/site.xml new file mode 100644 index 0000000..134f199 --- 
/dev/null +++ b/src/site/site.xml @@ -0,0 +1,29 @@ + + + + + HBase + http://hbase.apache.org/images/hbase_logo_med.gif + http://hbase.apache.org/ + + + + + + + + + + + + + + + + + + + + diff --git a/src/site/xdoc/acid-semantics.xml b/src/site/xdoc/acid-semantics.xml new file mode 100644 index 0000000..83bf3df --- /dev/null +++ b/src/site/xdoc/acid-semantics.xml @@ -0,0 +1,227 @@ + + + + + + + + +
+ + HBase ACID Properties + +
+ + +
+ About this Document +

HBase is not an ACID compliant database. However, it does guarantee certain specific + properties.

+

This specification enumerates the ACID properties of HBase.

+
+
+ Definitions +

For the sake of common vocabulary, we define the following terms:

+
+
Atomicity
+
an operation is atomic if it either completes entirely or not at all
+ +
Consistency
+
+ all actions cause the table to transition from one valid state directly to another + (eg a row will not disappear during an update, etc) +
+ +
Isolation
+
+ an operation is isolated if it appears to complete independently of any other concurrent transaction +
+ +
Durability
+
any update that reports "successful" to the client will not be lost
+ +
Visibility
+
an update is considered visible if any subsequent read will see the update as having been committed
+
+

+ The terms must and may are used as specified by RFC 2119. + In short, the word "must" implies that, if some case exists where the statement + is not true, it is a bug. The word "may" implies that, even if the guarantee + is provided in a current release, users should not rely on it. +

+
+
+ APIs to consider +
    +
  • Read APIs +
      +
    • get
    • +
    • scan
    • +
    +
  • +
  • Write APIs
  • +
      +
    • put
    • +
    • batch put
    • +
    • delete
    • +
    +
  • Combination (read-modify-write) APIs
  • +
      +
    • incrementColumnValue
    • +
    • checkAndPut
    • +
    +
+
+ +
+ Guarantees Provided + +
+ Atomicity + +
    +
  1. All mutations are atomic within a row. Any put will either wholly succeed or wholly fail.
  2. +
      +
    1. An operation that returns a "success" code has completely succeeded.
    2. +
    3. An operation that returns a "failure" code has completely failed.
    4. +
    5. An operation that times out may have succeeded and may have failed. However, + it will not have partially succeeded or failed.
    6. +
    +
  3. This is true even if the mutation crosses multiple column families within a row.
  4. +
  5. APIs that mutate several rows will _not_ be atomic across the multiple rows. + For example, a multiput that operates on rows 'a','b', and 'c' may return having + mutated some but not all of the rows. In such cases, these APIs will return a list + of success codes, each of which may be succeeded, failed, or timed out as described above.
  6. +
  7. The checkAndPut API happens atomically like the typical compareAndSet (CAS) operation + found in many hardware architectures.
  8. +
  9. The order of mutations is seen to happen in a well-defined order for each row, with no + interleaving. For example, if one writer issues the mutation "a=1,b=1,c=1" and + another writer issues the mutation "a=2,b=2,c=2", the row must either + be "a=1,b=1,c=1" or "a=2,b=2,c=2" and must not be something + like "a=1,b=2,c=1".
  10. +
      +
    1. Please note that this is not true _across rows_ for multirow batch mutations.
    2. +
    +
+
+
+ Consistency and Isolation +
    +
  1. All rows returned via any access API will consist of a complete row that existed at + some point in the table's history.
  2. +
  3. This is true across column families - i.e a get of a full row that occurs concurrent + with some mutations 1,2,3,4,5 will return a complete row that existed at some point in time + between mutation i and i+1 for some i between 1 and 5.
  4. +
  5. The state of a row will only move forward through the history of edits to it.
  6. +
+ +
Consistency of Scans +

+ A scan is not a consistent view of a table. Scans do + not exhibit snapshot isolation. +

+

+ Rather, scans have the following properties: +

+ +
    +
  1. + Any row returned by the scan will be a consistent view (i.e. that version + of the complete row existed at some point in time) +
  2. +
  3. + A scan will always reflect a view of the data at least as new as + the beginning of the scan. This satisfies the visibility guarantees + enumerated below.
  4. +
      +
    1. For example, if client A writes data X and then communicates via a side + channel to client B, any scans started by client B will contain data at least + as new as X.
    2. +
    3. A scan _must_ reflect all mutations committed prior to the construction + of the scanner, and _may_ reflect some mutations committed subsequent to the + construction of the scanner.
    4. +
    5. Scans must include all data written prior to the scan (except in + the case where data is subsequently mutated, in which case it _may_ reflect + the mutation)
    6. +
    +
+

+ Those familiar with relational databases will recognize this isolation level as "read committed". +

+

+ Please note that the guarantees listed above regarding scanner consistency + are referring to "transaction commit time", not the "timestamp" + field of each cell. That is to say, a scanner started at time t may see edits + with a timestamp value greater than t, if those edits were committed with a + "forward dated" timestamp before the scanner was constructed. +

+
+
+
+ Visibility +
    +
  1. When a client receives a "success" response for any mutation, that + mutation is immediately visible to both that client and any client with whom it + later communicates through side channels.
  2. +
  3. A row must never exhibit so-called "time-travel" properties. That + is to say, if a series of mutations moves a row sequentially through a series of + states, any sequence of concurrent reads will return a subsequence of those states.
  4. +
      +
    1. For example, if a row's cells are mutated using the "incrementColumnValue" + API, a client must never see the value of any cell decrease (see the sketch after this list).
    2. +
    3. This is true regardless of which read API is used to read back the mutation.
    4. +
    +
  5. Any version of a cell that has been returned to a read operation is guaranteed to + be durably stored.
  6. +
+ +
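To make the counter behavior concrete, here is a hedged Java sketch of an incrementColumnValue call; the table and column names ("counters", "cf:hits") are illustrative assumptions, not part of the specification above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.util.Bytes;

public class IncrementExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    HTable table = new HTable(conf, "counters");   // illustrative table and column names
    // Atomic read-modify-write: concurrent readers may observe the old or the new
    // value, but never a partially applied or decreasing counter.
    long after = table.incrementColumnValue(
        Bytes.toBytes("row1"), Bytes.toBytes("cf"), Bytes.toBytes("hits"), 1L);
    System.out.println("counter is now " + after);
    table.close();
  }
}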
+
+ Durability +
    +
  1. All visible data is also durable data. That is to say, a read will never return + data that has not been made durable on disk[1]
  2. +
  3. Any operation that returns a "success" code (eg does not throw an exception) + will be made durable.
  4. +
  5. Any operation that returns a "failure" code will not be made durable + (subject to the Atomicity guarantees above)
  6. +
  7. All reasonable failure scenarios will not affect any of the guarantees of this document.
  8. + +
+
+
+ Tunability +

All of the above guarantees must be possible within HBase. For users who would like to trade + off some guarantees for performance, HBase may offer several tuning options. For example:

+
    +
  • Visibility may be tuned on a per-read basis to allow stale reads or time travel.
  • +
  • Durability may be tuned to only flush data to disk on a periodic basis
  • +
+
+
+
+ Footnotes + +

[1] In the context of HBase, "durably on disk" implies an hflush() call on the transaction + log. This does not actually imply an fsync() to magnetic media, but rather just that the data has been + written to the OS cache on all replicas of the log. In the case of a full datacenter power loss, it is + possible that the edits are not truly durable.

+
+ + +
diff --git a/src/site/xdoc/bulk-loads.xml b/src/site/xdoc/bulk-loads.xml new file mode 100644 index 0000000..fc61ebe --- /dev/null +++ b/src/site/xdoc/bulk-loads.xml @@ -0,0 +1,148 @@ + + + + + + + + +
+ + Bulk Loads in HBase + +
+ +
+ Overview +

+ HBase includes several methods of loading data into tables. + The most straightforward method is to either use the TableOutputFormat + class from a MapReduce job, or use the normal client APIs; however, + these are not always the most efficient methods. +

+

+ This document describes HBase's bulk load functionality. The bulk load + feature uses a MapReduce job to output table data in HBase's internal + data format, and then directly loads the data files into a running + cluster. +

+
+
+ Bulk Load Architecture +

+ The HBase bulk load process consists of two main steps. +

+ + Preparing data via a MapReduce job +

+ The first step of a bulk load is to generate HBase data files from + a MapReduce job using HFileOutputFormat. This output format writes + out data in HBase's internal storage format so that they can be + later loaded very efficiently into the cluster. +

+

+ In order to function efficiently, HFileOutputFormat must be configured + such that each output HFile fits within a single region. In order to + do this, jobs use Hadoop's TotalOrderPartitioner class to partition the + map output into disjoint ranges of the key space, corresponding to the + key ranges of the regions in the table. +

+

+ HFileOutputFormat includes a convenience function, configureIncrementalLoad(), + which automatically sets up a TotalOrderPartitioner based on the current + region boundaries of a table. +

+
+ + Completing the data load +

+ After the data has been prepared using HFileOutputFormat, it + is loaded into the cluster using a command line tool. This command line tool + iterates through the prepared data files, and for each one determines the + region the file belongs to. It then contacts the appropriate Region Server + which adopts the HFile, moving it into its storage directory and making + the data available to clients. +

+

+ If the region boundaries have changed during the course of bulk load + preparation, or between the preparation and completion steps, the bulk + load commandline utility will automatically split the data files into + pieces corresponding to the new boundaries. This process is not + optimally efficient, so users should take care to minimize the delay between + preparing a bulk load and importing it into the cluster, especially + if other clients are simultaneously loading data through other means. +

+
+
+
+ Preparing a bulk load using the <code>importtsv</code> tool +

+ HBase ships with a command line tool called importtsv. This tool + is available by running hadoop jar /path/to/hbase-VERSION.jar importtsv. + Running this tool with no arguments prints brief usage information: +

+
+Usage: importtsv -Dimporttsv.columns=a,b,c <tablename> <inputdir>
+
+Imports the given input directory of TSV data into the specified table.
+
+The column names of the TSV data must be specified using the -Dimporttsv.columns
+option. This option takes the form of comma-separated column names, where each
+column name is either a simple column family, or a columnfamily:qualifier. The special
+column name HBASE_ROW_KEY is used to designate that this column should be used
+as the row key for each imported record. You must specify exactly one column
+to be the row key.
+
+In order to prepare data for a bulk data load, pass the option:
+  -Dimporttsv.bulk.output=/path/for/output
+
+Other options that may be specified with -D include:
+  -Dimporttsv.skip.bad.lines=false - fail if encountering an invalid line
+
+
+
+ Importing the prepared data using the <code>completebulkload</code> tool +

+ After a data import has been prepared using the importtsv tool, the + completebulkload tool is used to import the data into the running cluster. +

+

+ The completebulkload tool simply takes the same output path where + importtsv put its results, and the table name. For example: +

+ $ hadoop jar hbase-VERSION.jar completebulkload /user/todd/myoutput mytable +

+ This tool will run quickly, after which point the new data will be visible in + the cluster. +

+
+
+ Advanced Usage +

 Although the importtsv tool is useful in many cases, advanced users may + want to generate data programmatically, or import data from other formats. To get + started doing so, dig into ImportTsv.java and check the JavaDoc for + HFileOutputFormat. +

+


 The import step of the bulk load can also be done programmatically. See the + LoadIncrementalHFiles class for more information. +
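+ As an illustration only (the path and table name are carried over from the examples above and are not special), a minimal programmatic import might look roughly like this:
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
+
+public class BulkLoadComplete {
+  public static void main(String[] args) throws Exception {
+    Configuration conf = HBaseConfiguration.create();
+    // Directory of HFiles written by HFileOutputFormat (or by importtsv with
+    // -Dimporttsv.bulk.output), and the table to load them into.
+    Path hfiles = new Path("/user/todd/myoutput");
+    HTable table = new HTable(conf, "mytable");
+    // Walks the prepared files, splits any that cross current region
+    // boundaries, and hands each file to the region server owning its range.
+    new LoadIncrementalHFiles(conf).doBulkLoad(hfiles, table);
+  }
+}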

+
+ +
\ No newline at end of file diff --git a/src/site/xdoc/cygwin.xml b/src/site/xdoc/cygwin.xml new file mode 100644 index 0000000..5630cff --- /dev/null +++ b/src/site/xdoc/cygwin.xml @@ -0,0 +1,254 @@ + + + + + + +
+ Installing HBase on Windows using Cygwin +
+ + +
+Introduction +

HBase is a distributed, column-oriented store, modeled after Google's BigTable. HBase is built on top of Hadoop for its MapReduce and distributed file system implementation. All these projects are open-source and part of the Apache Software Foundation.

+ +


 Being distributed, large-scale platforms, the Hadoop and HBase projects mainly focus on *nix environments for production installations. However, being developed in Java, both projects are fully portable across platforms and, hence, also to the Windows operating system. For ease of development they rely on Cygwin to provide a *nix-like environment on Windows to run the shell scripts.


+
+
+Purpose +


 This document explains the intricacies of running HBase on Windows using Cygwin as an all-in-one single-node installation for testing and development. The HBase Overview and QuickStart guides, on the other hand, go a long way in explaining how to set up HBase in more complex deployment scenarios.


+
+ +
+Installation +

For running HBase on Windows, 3 technologies are required: Java, Cygwin and SSH. The following paragraphs detail the installation of each of the aforementioned technologies.

+
+Java +


 HBase depends on the Java Platform, Standard Edition, 6 Release. So the target system has to be provided with at least the Java Runtime Environment (JRE); however, if the system will also be used for development, the Java Development Kit (JDK) is preferred. You can download the latest versions for both from Sun's download page. Installation is a simple GUI wizard that guides you through the process.


+
+
+Cygwin +


 Cygwin is probably the oddest technology in this solution stack. It provides a dynamic link library that emulates most of a *nix environment on Windows. On top of that, a whole bunch of the most common *nix tools are supplied. Combined, the DLL and the tools form a very *nix-like environment on Windows.


+ +

For installation, Cygwin provides the setup.exe utility that tracks the versions of all installed components on the target system and provides the mechanism for installing or updating everything from the mirror sites of Cygwin.

+ +


 To support installation, the setup.exe utility uses 2 directories on the target system: the Root directory for Cygwin (defaults to C:\cygwin), which will become / within the eventual Cygwin installation, and the Local Package directory (e.g. C:\cygsetup), which is the cache where setup.exe stores the packages before they are installed. The cache must not be the same folder as the Cygwin root.


+ +


 Perform the following steps to install Cygwin, which are elaborately detailed in the 2nd chapter of the Cygwin User's Guide:


+ +
    +
  1. Make sure you have Administrator privileges on the target system.
  2. +
  3. Choose and create your Root and Local Package directories. A good suggestion is to use the C:\cygwin\root and C:\cygwin\setup folders.
  4. +
  5. Download the setup.exe utility and save it to the Local Package directory.
  6. +
  7. Run the setup.exe utility, +
      +
    1. Choose the Install from Internet option,
    2. +
    3. Choose your Root and Local Package folders
    4. +
    5. and select an appropriate mirror.
    6. +
    7. Don't select any additional packages yet, as we only want to install Cygwin for now.
    8. +
    9. Wait for download and install
    10. +
    11. Finish the installation
    12. +
    +
  8. +
  9. Optionally, you can now also add a shortcut to your Start menu pointing to the setup.exe utility in the Local Package folder.
  10. +
  11. Add CYGWIN_HOME system-wide environment variable that points to your Root directory.
  12. +
  13. Add %CYGWIN_HOME%\bin to the end of your PATH environment variable.
  14. +
  15. Reboot the system after making changes to the environment variables, otherwise the OS will not be able to find the Cygwin utilities.
  16. +
  17. Test your installation by running your freshly created shortcuts or the Cygwin.bat command in the Root folder. You should end up in a terminal window that is running a Bash shell. Test the shell by issuing the following commands: +
      +
    1. cd / should take you to the Root directory in Cygwin;
    2. +
    3. the ls command should list all files and folders in the current directory.
    4. +
    5. Use the exit command to end the terminal.
    6. +
    +
  18. +
  19. When needed, to uninstall Cygwin you can simply delete the Root and Local Package directory, and the shortcuts that were created during installation.
  20. +
+
+
SSH +

HBase (and Hadoop) rely on SSH for interprocess/-node communication and launching remote commands. SSH will be provisioned on the target system via Cygwin, which supports running Cygwin programs as Windows services!

+ +
    +
  1. Rerun the setup.exe utility.
  2. +
  3. Leave all parameters as is, skipping through the wizard using the Next button until the Select Packages panel is shown.
  4. +
  5. Maximize the window and click the View button to toggle to the list view, which is ordered alphabetically on Package, making it easier to find the packages we'll need.
  6. +
  7. Select the following packages by clicking the status word (normally Skip) so it's marked for installation. Use the Next button to download and install the packages. +
      +
    1. OpenSSH
    2. +
    3. tcp_wrappers
    4. +
    5. diffutils
    6. +
    7. zlib
    8. +
    +
  8. +
  9. Wait for the install to complete and finish the installation.
  10. +
+
+
+HBase +

Download the latest release of HBase from the website. As the HBase distributable is just a zipped archive, installation is as simple as unpacking the archive so it ends up in its final installation directory. Notice that HBase has to be installed in Cygwin and a good directory suggestion is to use /usr/local/ (or [Root directory]\usr\local in Windows slang). You should end up with a /usr/local/hbase-<version> installation in Cygwin.

+ +This finishes installation. We go on with the configuration. +
+
+
+Configuration +


 There are 3 parts left to configure: Java, SSH and HBase itself. The following paragraphs explain each topic in detail.


+
+Java +


 One important thing to remember in shell scripting in general (i.e. *nix and Windows) is that managing, manipulating and assembling path names that contain spaces can be very hard, due to the need to escape and quote those characters and strings. So we try to stay away from spaces in path names. *nix environments can help us out here very easily by using symbolic links.


+ +
    +
  1. Create a link in /usr/local to the Java home directory by using the following command and substituting the name of your chosen Java environment: +
    ln -s /cygdrive/c/Program\ Files/Java/<jre name> /usr/local/<jre name>
    +
  2. +
  3. Test your Java installation by changing directories to your Java folder with cd /usr/local/<jre name> and issuing the command ./bin/java -version. This should output the version of the chosen JRE.
  4. +
+
+
+SSH +

Configuring SSH is quite elaborate, but primarily a question of launching it by default as a Windows service.

+ +
    +
  1. On Windows Vista and above make sure you run the Cygwin shell with elevated privileges, by right-clicking on the shortcut and using Run as Administrator.
  2. +
  3. First of all, we have to make sure the rights on some crucial files are correct. Use the commands underneath. You can verify all rights by using the ls -l command on the different files. Also, note that the auto-completion feature in the shell using <TAB> is extremely handy in these situations. +
      +
    1. chmod +r /etc/passwd to make the passwords file readable for all
    2. +
    3. chmod u+w /etc/passwd to make the passwords file writable for the owner
    4. +
    5. chmod +r /etc/group to make the groups file readable for all
    6. +
    +
      +
    1. chmod u+w /etc/group to make the groups file writable for the owner
    2. +
    +
      +
    1. chmod 755 /var to make the var folder writable to owner and readable and executable to all
    2. +
    +
  4. +
  5. Edit the /etc/hosts.allow file using your favorite editor (why not VI in the shell!) and make sure the following two lines are in there before the PARANOID line: +
      +
    1. ALL : localhost 127.0.0.1/32 : allow
    2. +
    3. ALL : [::1]/128 : allow
    4. +
    +
  6. +
  7. Next we have to configure SSH by using the script ssh-host-config +
      +
    1. If this script asks to overwrite an existing /etc/ssh_config, answer yes.
    2. +
    3. If this script asks to overwrite an existing /etc/sshd_config, answer yes.
    4. +
    5. If this script asks to use privilege separation, answer yes.
    6. +
    7. If this script asks to install sshd as a service, answer yes. Make sure you started your shell as Administrator!
    8. +
    9. If this script asks for the CYGWIN value, just <enter> as the default is ntsec.
    10. +
    11. If this script asks to create the sshd account, answer yes.
    12. +
    13. If this script asks to use a different user name as service account, answer no as the default will suffice.
    14. +
    15. If this script asks to create the cyg_server account, answer yes. Enter a password for the account.
    16. +
    +
  8. +
  9. Start the SSH service using net start sshd or cygrunsrv --start sshd. Notice that cygrunsrv is the utility that makes the process run as a Windows service. Confirm that you see a message stating that the CYGWIN sshd service was started successfully.
  10. +
  11. Harmonize the Windows and Cygwin user accounts by using the commands: +
      +
    1. mkpasswd -cl > /etc/passwd
    2. +
    3. mkgroup --local > /etc/group
    4. +
    +
  12. +
  13. Test the installation of SSH: +
      +
    1. Open a new Cygwin terminal
    2. +
    3. Use the command whoami to verify your userID
    4. +
    5. Issue an ssh localhost to connect to the system itself +
        +
      1. Answer yes when presented with the server's fingerprint
      2. +
      3. Issue your password when prompted
      4. +
      5. Test a few commands in the remote session
      6. +
      7. The exit command should take you back to your first shell in Cygwin
      8. +
      +
    6. +
    7. Exit should terminate the Cygwin shell.
    8. +
    +
  14. +
+
+
+HBase +If all previous configurations are working properly, we just need some tinkering with the HBase config files to make them resolve properly on Windows/Cygwin. All files and paths referenced here start from the HBase [installation directory] as the working directory. +
    +
  1. HBase uses ./conf/hbase-env.sh to configure its dependencies on the runtime environment. Copy and uncomment the following lines just underneath their originals, changing them to fit your environment. They should read something like: +
      +
    1. export JAVA_HOME=/usr/local/<jre name>
    2. +
    3. export HBASE_IDENT_STRING=$HOSTNAME as this most likely does not include spaces.
    4. +
    +
  2. +
  3. HBase uses the ./conf/hbase-default.xml file for configuration. Some properties do not resolve to existing directories because the JVM runs on Windows. This is the major issue to keep in mind when working with Cygwin: within the shell all paths are *nix-like, hence relative to the root /. However, every parameter that is consumed by the Windows processes themselves needs to be a Windows setting, hence C:\-like. Change the following properties in the configuration file, adjusting paths where necessary to conform with your own installation (see the example snippet after this list): +
      +
    1. hbase.rootdir must read e.g. file:///C:/cygwin/root/tmp/hbase/data
    2. +
    3. hbase.tmp.dir must read C:/cygwin/root/tmp/hbase/tmp
    4. +
    5. hbase.zookeeper.quorum must read 127.0.0.1 because for some reason localhost doesn't seem to resolve properly on Cygwin.
    6. +
    +
  4. +
  5. Make sure the configured hbase.rootdir and hbase.tmp.dir directories exist and have the proper rights set up e.g. by issuing a chmod 777 on them.
  6. +
+
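+ As an illustration only (using the example values above; adjust names and paths to your own installation), the corresponding entries in the configuration file would look something like:
+
+<property>
+  <name>hbase.rootdir</name>
+  <value>file:///C:/cygwin/root/tmp/hbase/data</value>
+</property>
+<property>
+  <name>hbase.tmp.dir</name>
+  <value>C:/cygwin/root/tmp/hbase/tmp</value>
+</property>
+<property>
+  <name>hbase.zookeeper.quorum</name>
+  <value>127.0.0.1</value>
+</property>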
+
+
+Testing +

+This should conclude the installation and configuration of HBase on Windows using Cygwin. So it's time to test it. +

    +
  1. Start a Cygwin terminal, if you haven't already.
  2. +
  3. Change directory to the HBase installation using cd /usr/local/hbase-<version>, preferably using auto-completion.
  4. +
  5. Start HBase using the command ./bin/start-hbase.sh +
      +
    1. When prompted to accept the SSH fingerprint, answer yes.
    2. +
    3. When prompted, provide your password (possibly multiple times).
    4. +
    5. When the command completes, the HBase server should have started.
    6. +
    7. However, to be absolutely certain, check the logs in the ./logs directory for any exceptions.
    8. +
    +
  6. +
  7. Next we start the HBase shell using the command ./bin/hbase shell
  8. +
  9. We run some simple test commands +
      +
    1. Create a simple table using command create 'test', 'data'
    2. +
    3. Verify the table exists using the command list
    4. +
    5. Insert data into the table using e.g. +
      put 'test', 'row1', 'data:1', 'value1'
      +put 'test', 'row2', 'data:2', 'value2'
      +put 'test', 'row3', 'data:3', 'value3'
      +
    6. +
    7. List all rows in the table using the command scan 'test', which should list all the rows previously inserted. Notice how 3 new columns were added without changing the schema!
    8. +
    9. Finally we get rid of the table by issuing disable 'test' followed by drop 'test', and verify with list, which should give an empty listing.
    10. +
    +
  10. +
  11. Leave the shell by exit
  12. +
  13. To stop the HBase server, issue the ./bin/stop-hbase.sh command and wait for it to complete! Killing the process might corrupt your data on disk.
  14. +
  15. In case of problems, +
      +
    1. verify the HBase logs in the ./logs directory.
    2. +
    3. Try to fix the problem
    4. +
    5. Get help on the forums or IRC (#hbase@freenode.net). People are very active and keen to help out!
    6. +
    7. Stop, restart and retest the server.
    8. +
    +
  16. +
+

+
+ +
+Conclusion +

+Now that your HBase server is running, start coding and build that next killer app on this particular, but scalable datastore! +

+
+ +
diff --git a/src/site/xdoc/index.xml b/src/site/xdoc/index.xml new file mode 100644 index 0000000..2b25a89 --- /dev/null +++ b/src/site/xdoc/index.xml @@ -0,0 +1,40 @@ + + + + + + + +
+ HBase Documentation +
+ + +

+ The following documents provide concepts and procedures that will help you + get started using HBase. If you have more questions, you can ask the + mailing list or browse the archives. +

+ + + +
diff --git a/src/site/xdoc/metrics.xml b/src/site/xdoc/metrics.xml new file mode 100644 index 0000000..83389e1 --- /dev/null +++ b/src/site/xdoc/metrics.xml @@ -0,0 +1,152 @@ + + + + + + + + +
+ + HBase Metrics + +
+ + +
+ Introduction +

+ HBase emits Hadoop metrics. +

+
+
+ HOWTO +


 First read up on Hadoop metrics. + If you are using ganglia, the GangliaMetrics + wiki page is a useful read.


+


 To have HBase emit metrics, edit $HBASE_HOME/conf/hadoop-metrics.properties + and enable metric 'contexts' per plugin. As of this writing, hadoop supports + file and ganglia plugins. + Yes, the hbase metrics file is named hadoop-metrics rather than + hbase-metrics because, currently at least, the hadoop metrics system has the + properties filename hardcoded. Per metrics context, + comment out the NullContext and enable one or more plugins instead. +

+
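+ As an illustration only (the multicast address and port below are common ganglia defaults; adjust them to your own ganglia setup), enabling the ganglia plugin for the hbase context would look something like:
+
+# Configuration of the "hbase" context for ganglia
+hbase.class=org.apache.hadoop.metrics.ganglia.GangliaContext
+hbase.period=60
+hbase.servers=239.2.11.71:8649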

+ If you enable the hbase context, on regionservers you'll see total requests since last + metric emission, count of regions and storefiles as well as a count of memstore size. + On the master, you'll see a count of the cluster's requests. +

+

+ Enabling the rpc context is good if you are interested in seeing + metrics on each hbase rpc method invocation (counts and time taken). +

+

+ The jvm context is + useful for long-term stats on running hbase jvms -- memory used, thread counts, etc. + As of this writing, if more than one jvm is running emitting metrics, at least + in ganglia, the stats are aggregated rather than reported per instance. +

+
+ +
+ Using with JMX +

+ In addition to the standard output contexts supported by the Hadoop + metrics package, you can also export HBase metrics via Java Management + Extensions (JMX). This will allow viewing HBase stats in JConsole or + any other JMX client. +

+
+ Enable HBase stats collection +

+ To enable JMX support in HBase, first edit + $HBASE_HOME/conf/hadoop-metrics.properties to support + metrics refreshing. (If you've already configured + hadoop-metrics.properties for another output context, + you can skip this step). +

+ +# Configuration of the "hbase" context for null +hbase.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread +hbase.period=60 + +# Configuration of the "jvm" context for null +jvm.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread +jvm.period=60 + +# Configuration of the "rpc" context for null +rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread +rpc.period=60 + +
+
+ Setup JMX remote access +

+ For remote access, you will need to configure JMX remote passwords + and access profiles. Create the files: +

+
+
$HBASE_HOME/conf/jmxremote.passwd (set permissions + to 600)
+
+ +monitorRole monitorpass +controlRole controlpass + +
+ +
$HBASE_HOME/conf/jmxremote.access
+
+ +monitorRole readonly +controlRole readwrite + +
+
+
+
+ Configure JMX in HBase startup +

+ Finally, edit the $HBASE_HOME/conf/hbase-env.sh + script to add JMX support: +

+
+
$HBASE_HOME/conf/hbase-env.sh
+
+

Add the lines:

+ +HBASE_JMX_OPTS="-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false" +HBASE_JMX_OPTS="$HBASE_JMX_OPTS -Dcom.sun.management.jmxremote.password.file=$HBASE_HOME/conf/jmxremote.passwd" +HBASE_JMX_OPTS="$HBASE_JMX_OPTS -Dcom.sun.management.jmxremote.access.file=$HBASE_HOME/conf/jmxremote.access" + +export HBASE_MASTER_OPTS="$HBASE_JMX_OPTS -Dcom.sun.management.jmxremote.port=10101" +export HBASE_REGIONSERVER_OPTS="$HBASE_JMX_OPTS -Dcom.sun.management.jmxremote.port=10102" + +
+
+

+ After restarting the processes you want to monitor, you should now be + able to run JConsole (included with the JDK since JDK 5.0) to view + the statistics via JMX. HBase MBeans are exported under the + hadoop domain in JMX. +
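+ For example (the host name is illustrative; 10101 is the master port configured in hbase-env.sh above), you could attach to the master with:
+
+ $ jconsole master.example.com:10101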

+
+
+ +
diff --git a/src/site/xdoc/site.xml b/src/site/xdoc/site.xml new file mode 100644 index 0000000..6a42647 --- /dev/null +++ b/src/site/xdoc/site.xml @@ -0,0 +1,76 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/site/xdoc/tabs.xml b/src/site/xdoc/tabs.xml new file mode 100644 index 0000000..5af2a23 --- /dev/null +++ b/src/site/xdoc/tabs.xml @@ -0,0 +1,35 @@ + + + + + + + + + + + + +