{
+ private HTable m_table;
+
+ /**
+ * Instantiate a TableRecordWriter with the HBase HClient for writing.
+ *
+ * @param table
+ */
+ public TableRecordWriter(HTable table) {
+ m_table = table;
+ }
+
+ public void close(Reporter reporter)
+ throws IOException {
+ m_table.flushCommits();
+ }
+
+ public void write(ImmutableBytesWritable key,
+ Put value) throws IOException {
+ m_table.put(new Put(value));
+ }
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public RecordWriter getRecordWriter(FileSystem ignored,
+ JobConf job, String name, Progressable progress) throws IOException {
+
+ // expecting exactly one path
+
+ String tableName = job.get(OUTPUT_TABLE);
+ HTable table = null;
+ try {
+ table = new HTable(new HBaseConfiguration(job), tableName);
+ } catch(IOException e) {
+ LOG.error(e);
+ throw e;
+ }
+ table.setAutoFlush(false);
+ return new TableRecordWriter(table);
+ }
+
+ @Override
+ public void checkOutputSpecs(FileSystem ignored, JobConf job)
+ throws FileAlreadyExistsException, InvalidJobConfException, IOException {
+
+ String tableName = job.get(OUTPUT_TABLE);
+ if(tableName == null) {
+ throw new IOException("Must specify table name");
+ }
+ }
+}
\ No newline at end of file
Index: src/java/org/apache/hadoop/hbase/mapred/package-info.java
===================================================================
--- src/java/org/apache/hadoop/hbase/mapred/package-info.java (revision 0)
+++ src/java/org/apache/hadoop/hbase/mapred/package-info.java (revision 0)
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+Provides HBase MapReduce
+Input/OutputFormats, a table indexing MapReduce job, and utility methods.
+
+Table of Contents
+
+
+
+
+MapReduce jobs deployed to a MapReduce cluster do not by default have access
+to the HBase configuration under $HBASE_CONF_DIR nor to HBase classes.
+You could add hbase-site.xml to $HADOOP_HOME/conf and add
+hbase-X.X.X.jar to the $HADOOP_HOME/lib and copy these
+changes across your cluster but the cleanest means of adding hbase configuration
+and classes to the cluster CLASSPATH is by uncommenting
+HADOOP_CLASSPATH in $HADOOP_HOME/conf/hadoop-env.sh
+and adding the path to the hbase jar and $HBASE_CONF_DIR directory.
+Then copy the amended configuration around the cluster.
+You'll probably need to restart the MapReduce cluster if you want it to notice
+the new configuration.
+
+
+For example, here is how you would amend hadoop-env.sh adding the
+built hbase jar, hbase conf, and the PerformanceEvaluation class from
+the built hbase test jar to the hadoop CLASSPATH:
+
+# Extra Java CLASSPATH elements. Optional.
+# export HADOOP_CLASSPATH=
+export HADOOP_CLASSPATH=$HBASE_HOME/build/test:$HBASE_HOME/build/hbase-X.X.X.jar:$HBASE_HOME/build/hbase-X.X.X-test.jar:$HBASE_HOME/conf
+
+Expand $HBASE_HOME in the above appropriately to suit your
+local environment.
+
+After copying the above change around your cluster, this is how you would run
+the PerformanceEvaluation MR job to put up 4 clients (Presumes a ready mapreduce
+cluster):
+
+
+$HADOOP_HOME/bin/hadoop org.apache.hadoop.hbase.PerformanceEvaluation sequentialWrite 4
+
+The PerformanceEvaluation class will be found on the CLASSPATH because you
+added $HBASE_HOME/build/test to HADOOP_CLASSPATH
+
+
+Another possibility, if for example you do not have access to hadoop-env.sh or
+are unable to restart the hadoop cluster, is bundling the hbase jar into a mapreduce
+job jar adding it and its dependencies under the job jar lib/
+directory and the hbase conf into a job jar conf/ directory.
+
+
+
+
+HBase can be used as a data source, {@link org.apache.hadoop.hbase.mapred.TableInputFormat TableInputFormat},
+and data sink, {@link org.apache.hadoop.hbase.mapred.TableOutputFormat TableOutputFormat}, for MapReduce jobs.
+Writing MapReduce jobs that read or write HBase, you'll probably want to subclass
+{@link org.apache.hadoop.hbase.mapred.TableMap TableMap} and/or
+{@link org.apache.hadoop.hbase.mapred.TableReduce TableReduce}. See the do-nothing
+pass-through classes {@link org.apache.hadoop.hbase.mapred.IdentityTableMap IdentityTableMap} and
+{@link org.apache.hadoop.hbase.mapred.IdentityTableReduce IdentityTableReduce} for basic usage. For a more
+involved example, see {@link org.apache.hadoop.hbase.mapred.BuildTableIndex BuildTableIndex}
+or review the org.apache.hadoop.hbase.mapred.TestTableMapReduce unit test.
+
+
+Running mapreduce jobs that have hbase as source or sink, you'll need to
+specify source/sink table and column names in your configuration.
+
+Reading from hbase, the TableInputFormat asks hbase for the list of
+regions and makes a map-per-region or mapred.map.tasks maps,
+whichever is smaller (If your job only has two maps, up mapred.map.tasks
+to a number > number of regions). Maps will run on the adjacent TaskTracker
+if you are running a TaskTracker and RegionServer per node.
+Writing, it may make sense to avoid the reduce step and write yourself back into
+hbase from inside your map. You'd do this when your job does not need the sort
+and collation that mapreduce does on the map emitted data; on insert,
+hbase 'sorts' so there is no point double-sorting (and shuffling data around
+your mapreduce cluster) unless you need to. If you do not need the reduce,
+you might just have your map emit counts of records processed just so the
+framework's report at the end of your job has meaning or set the number of
+reduces to zero and use TableOutputFormat. See example code
+below. If running the reduce step makes sense in your case, its usually better
+to have lots of reducers so load is spread across the hbase cluster.
+
+There is also a new hbase partitioner that will run as many reducers as
+currently existing regions. The
+{@link org.apache.hadoop.hbase.mapred.HRegionPartitioner} is suitable
+when your table is large and your upload is not such that it will greatly
+alter the number of existing regions when done; otherwise use the default
+partitioner.
+
+
+
+Sample Row Counter
+See {@link org.apache.hadoop.hbase.mapred.RowCounter}. You should be able to run
+it by doing: % ./bin/hadoop jar hbase-X.X.X.jar. This will invoke
+the hbase MapReduce Driver class. Select 'rowcounter' from the choice of jobs
+offered. You may need to add the hbase conf directory to $HADOOP_HOME/conf/hadoop-env.sh#HADOOP_CLASSPATH
+so the rowcounter gets pointed at the right hbase cluster (or, build a new jar
+with an appropriate hbase-site.xml built into your job jar).
+
+PerformanceEvaluation
+See org.apache.hadoop.hbase.PerformanceEvaluation from hbase src/test. It runs
+a mapreduce job to run concurrent clients reading and writing hbase.
+
+
+*/
+package org.apache.hadoop.hbase.mapred;
\ No newline at end of file
Index: src/java/org/apache/hadoop/hbase/mapred/TableMap.java
===================================================================
--- src/java/org/apache/hadoop/hbase/mapred/TableMap.java (revision 0)
+++ src/java/org/apache/hadoop/hbase/mapred/TableMap.java (revision 0)
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.mapred;
+
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.Mapper;
+
+/**
+ * Scan an HBase table to sort by a specified sort column.
+ * If the column does not exist, the record is not passed to Reduce.
+ *
+ * @param <K> WritableComparable key class
+ * @param <V> Writable value class
+ */
+@Deprecated
+public interface TableMap<K extends WritableComparable<K>, V extends Writable>
+extends Mapper<ImmutableBytesWritable, Result, K, V> {
+
+}
\ No newline at end of file
Index: src/java/org/apache/hadoop/hbase/mapred/IdentityTableReduce.java
===================================================================
--- src/java/org/apache/hadoop/hbase/mapred/IdentityTableReduce.java (revision 0)
+++ src/java/org/apache/hadoop/hbase/mapred/IdentityTableReduce.java (revision 0)
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.mapred;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * Write to table each key, record pair
+ */
+@Deprecated
+public class IdentityTableReduce
+extends MapReduceBase
+implements TableReduce<ImmutableBytesWritable, Put> {
+ @SuppressWarnings("unused")
+ private static final Log LOG =
+ LogFactory.getLog(IdentityTableReduce.class.getName());
+
+ /**
+ * No aggregation, output pairs of (key, record)
+ * @param key
+ * @param values
+ * @param output
+ * @param reporter
+ * @throws IOException
+ */
+ public void reduce(ImmutableBytesWritable key, Iterator<Put> values,
+ OutputCollector<ImmutableBytesWritable, Put> output,
+ Reporter reporter)
+ throws IOException {
+
+ while(values.hasNext()) {
+ output.collect(key, values.next());
+ }
+ }
+}
\ No newline at end of file
Index: src/java/org/apache/hadoop/hbase/mapred/Driver.java
===================================================================
--- src/java/org/apache/hadoop/hbase/mapred/Driver.java (revision 0)
+++ src/java/org/apache/hadoop/hbase/mapred/Driver.java (revision 0)
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.mapred;
+
+import org.apache.hadoop.util.ProgramDriver;
+
+/**
+ * Driver for hbase mapreduce jobs. Select which to run by passing
+ * name of job to this main.
+ */
+@Deprecated
+public class Driver {
+ /**
+ * @param args
+ * @throws Throwable
+ */
+ public static void main(String[] args) throws Throwable {
+ ProgramDriver pgd = new ProgramDriver();
+ pgd.addClass(RowCounter.NAME, RowCounter.class,
+ "Count rows in HBase table");
+ pgd.driver(args);
+ }
+}
\ No newline at end of file