From 9aab2b40e45e7588bb112c84f6a6e5ab23935dd3 Mon Sep 17 00:00:00 2001 From: Elliott Clark Date: Thu, 30 May 2013 22:33:52 -0700 Subject: [PATCH] Add IntegrationTestBulkLoad --- .../hbase/mapreduce/IntegrationTestBulkLoad.java | 598 ++++++++++++++++++++ .../hadoop/hbase/mapreduce/HFileOutputFormat.java | 4 + .../apache/hadoop/hbase/util/RegionSplitter.java | 33 ++ .../apache/hadoop/hbase/HBaseTestingUtility.java | 19 + 4 files changed, 654 insertions(+) create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/mapreduce/IntegrationTestBulkLoad.java diff --git hbase-it/src/test/java/org/apache/hadoop/hbase/mapreduce/IntegrationTestBulkLoad.java hbase-it/src/test/java/org/apache/hadoop/hbase/mapreduce/IntegrationTestBulkLoad.java new file mode 100644 index 0000000..808e932 --- /dev/null +++ hbase-it/src/test/java/org/apache/hadoop/hbase/mapreduce/IntegrationTestBulkLoad.java @@ -0,0 +1,598 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.mapreduce; + +import org.apache.commons.lang.RandomStringUtils; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.IntegrationTestingUtility; +import org.apache.hadoop.hbase.IntegrationTests; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.EnvironmentEdge; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.RegionSplitter; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.WritableComparator; +import org.apache.hadoop.io.WritableUtils; +import org.apache.hadoop.mapreduce.InputFormat; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Partitioner; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; 
+import org.junit.experimental.categories.Category; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.UUID; + +import static org.junit.Assert.assertEquals; + +/** + * Test Bulk Load and MR on a distributed cluster. + */ +@Category(IntegrationTests.class) +public class IntegrationTestBulkLoad implements Configurable, Tool { + + private static byte[] CHAIN_FAM = Bytes.toBytes("l"); + private static byte[] SORT_FAM = Bytes.toBytes("s"); + private static byte[] DATA_FAM = Bytes.toBytes("d"); + + private static String CHAIN_LENGTH_KEY = "hbase.IntegrationTestBulkLoad.chainLength"; + private static int CHAIN_LENGTH = 900000; + + private static String NUM_MAPS_KEY = "hbase.IntegrationTestBulkLoad.numMaps"; + private static int NUM_MAPS = 1; + + private static String NUM_IMPORT_ROUNDS_KEY = "hbase.IntegrationTestBulkLoad.numImportRounds"; + private static int NUM_IMPORT_ROUNDS = 1; + + + private static String TABLE_NAME_KEY = "hbase.IntegrationTestBulkLoad.tableName"; + private static String TABLE_NAME = "IntegrationTestBulkLoad"; + + private static IntegrationTestingUtility util; + + private String tableName; + private byte[] tableNameBytes; + + @Test + public void testBulkLoad() throws Exception { + setupTable(); + int numImportRounds = getConf().getInt(NUM_IMPORT_ROUNDS_KEY, NUM_IMPORT_ROUNDS); + for (int i = 0; i < numImportRounds; i++) { + runLinkedListMRJob(i); + } + runCheck(); + } + + private byte[][] getSplits(int numRegions) { + RegionSplitter.UniformSplit split = new RegionSplitter.UniformSplit(); + split.setFirstRow(Bytes.toBytes(0L)); + split.setLastRow(Bytes.toBytes(Long.MAX_VALUE)); + return split.split(numRegions); + } + + private void setupTable() throws IOException { + tableName = getConf().get(TABLE_NAME_KEY, TABLE_NAME); + tableNameBytes = Bytes.toBytes(tableName); + if (util.getHBaseAdmin().tableExists(tableNameBytes)) { + util.deleteTable(tableNameBytes); + } + + util.createTable( + tableNameBytes, + new byte[][]{CHAIN_FAM, SORT_FAM, DATA_FAM}, + getSplits(16) + ); + } + + private void runLinkedListMRJob(int iteration) throws Exception { + String jobName = IntegrationTestBulkLoad.class.getSimpleName() + " - " + + EnvironmentEdgeManager.currentTimeMillis(); + Configuration conf = new Configuration(util.getConfiguration()); + Path p = util.getDataTestDirOnTestFS(tableName + "-" + iteration); + HTable table = new HTable(conf, tableName); + + conf.setBoolean("mapreduce.map.speculative", false); + conf.setBoolean("mapreduce.reduce.speculative", false); + + Job job = new Job(conf); + + job.setJobName(jobName); + + // set the input format so that we can create map tasks with no data input. + job.setInputFormatClass(RandomInputFormat.class); + + // Set the mapper classes. + job.setMapperClass(LinkedListCreationMapper.class); + job.setMapOutputKeyClass(ImmutableBytesWritable.class); + job.setMapOutputValueClass(KeyValue.class); + + // Use the identity reducer + // So nothing to do here. + + // Set this jar. + job.setJarByClass(getClass()); + + // Set where to place the hfiles. + FileOutputFormat.setOutputPath(job, p); + + // Configure the partitioner and other things needed for HFileOutputFormat. + HFileOutputFormat.configureIncrementalLoad(job, table); + + // Run the job making sure it works. + assertEquals(true, job.waitForCompletion(true)); + + // Create a new loader. 
+    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
+
+    // Load the HFiles in.
+    loader.doBulkLoad(p, table);
+
+    // Delete the files.
+    util.getTestFileSystem().delete(p, true);
+  }
+
+  /**
+   * Class to generate splits. Each split gets a dummy split file. The associated
+   * RecordReader generates a single random number.
+   *
+   * This class is adapted from Hadoop tests.
+   */
+  static class RandomInputFormat extends InputFormat<Text, LongWritable> {
+    public List<InputSplit> getSplits(JobContext job) throws IOException {
+      List<InputSplit> result = new ArrayList<InputSplit>();
+      int numSplits = job.getConfiguration().getInt(NUM_MAPS_KEY, NUM_MAPS);
+      for (int i = 0; i < numSplits; ++i) {
+        result.add(new FileSplit(new Path("/tmp", "dummy-split-" + i), 0, 1, null));
+      }
+      return result;
+    }
+
+    /**
+     * RecordReader that doesn't read anything. Instead it generates a single random number.
+     * This is useful for debugging or starting map tasks with no data input.
+     *
+     * This class is adapted from Hadoop tests.
+     */
+    static class RandomRecordReader extends RecordReader<Text, LongWritable> {
+      Path name;
+      Text key = null;
+      LongWritable value = new LongWritable();
+
+      public RandomRecordReader(Path p) {
+        name = p;
+      }
+
+      public void initialize(InputSplit split,
+                             TaskAttemptContext context)
+          throws IOException, InterruptedException {
+      }
+
+      public boolean nextKeyValue() {
+        if (name != null) {
+          key = new Text();
+          key.set(name.getName());
+          name = null;
+          value.set(new Random().nextLong());
+          return true;
+        }
+        return false;
+      }
+
+      public Text getCurrentKey() {
+        return key;
+      }
+
+      public LongWritable getCurrentValue() {
+        return value;
+      }
+
+      public void close() {
+      }
+
+      public float getProgress() {
+        return 0.0f;
+      }
+    }
+
+    public RecordReader<Text, LongWritable> createRecordReader(InputSplit split,
+        TaskAttemptContext context) throws IOException, InterruptedException {
+      return new RandomRecordReader(((FileSplit) split).getPath());
+    }
+  }
+
+  /**
+   * Mapper that creates a linked list of KeyValues.
+   *
+   * Each map task generates one linked list.
+   * All lists start on row key 0L.
+   * All lists should be CHAIN_LENGTH long.
+   */
+  public static class LinkedListCreationMapper
+      extends Mapper<Text, LongWritable, ImmutableBytesWritable, KeyValue> {
+
+    private Random rand = new Random();
+
+    protected void map(Text key, LongWritable value, Context context)
+        throws IOException, InterruptedException {
+
+      long chainId = value.get();
+      byte[] chainIdArray = Bytes.toBytes(chainId);
+      long currentRow = 0;
+      long nextRow = Math.abs(rand.nextLong());
+
+      int chainLength = context.getConfiguration().getInt(CHAIN_LENGTH_KEY, CHAIN_LENGTH);
+
+      for (long i = 0; i < chainLength; i++) {
+        byte[] rk = Bytes.toBytes(currentRow);
+
+        // Next link in the chain.
+        KeyValue linkKv = new KeyValue(rk, CHAIN_FAM, chainIdArray, Bytes.toBytes(nextRow));
+        // What link in the chain this is.
+        KeyValue sortKv = new KeyValue(rk, SORT_FAM, chainIdArray, Bytes.toBytes(i));
+        // Add data so that large stores are created.
+        KeyValue dataKv = new KeyValue(rk, DATA_FAM, chainIdArray,
+          Bytes.toBytes(RandomStringUtils.randomAlphabetic(100))
+        );
+
+        // Emit the key values.
+        context.write(new ImmutableBytesWritable(rk), linkKv);
+        context.write(new ImmutableBytesWritable(rk), sortKv);
+        context.write(new ImmutableBytesWritable(rk), dataKv);
+        // Move to the next row.
+        currentRow = nextRow;
+        nextRow = Math.abs(rand.nextLong());
+      }
+    }
+  }
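For reference, the chain structure the mapper encodes can be sketched outside MapReduce with nothing but java.util. This is an illustration only, not part of the patch; the class name ChainWalkSketch and the tiny chain length are invented. A plain Map stands in for the CHAIN_FAM column (row key maps to next row key), and the walk at the end mirrors what the checking reducer further down verifies.

import java.util.HashMap;
import java.util.Map;
import java.util.Random;

public class ChainWalkSketch {
  public static void main(String[] args) {
    int chainLength = 10;                       // stands in for CHAIN_LENGTH
    Map<Long, Long> chain = new HashMap<Long, Long>();
    Random rand = new Random();

    // Build the random walk the same way LinkedListCreationMapper does:
    // every row points at a freshly chosen random next row.
    long currentRow = 0L;
    long nextRow = Math.abs(rand.nextLong());
    for (int i = 0; i < chainLength; i++) {
      chain.put(currentRow, nextRow);           // CHAIN_FAM cell: row -> next row
      currentRow = nextRow;
      nextRow = Math.abs(rand.nextLong());
    }

    // Walk from row 0 and follow the "next" pointers, counting links.
    long count = 0;
    long current = 0L;
    while (chain.containsKey(current)) {
      count++;
      current = chain.get(current);
    }
    if (count != chainLength) {
      throw new RuntimeException("Expected " + chainLength + " links, got " + count);
    }
  }
}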
+
+  /**
+   * Writable class used as the key to group links in the linked list.
+   *
+   * Used as the key emitted from a pass over the table.
+   */
+  public static class LinkKey implements WritableComparable<LinkKey> {
+
+    private Long chainId;
+    private Long order;
+
+    public Long getOrder() {
+      return order;
+    }
+
+    public Long getChainId() {
+      return chainId;
+    }
+
+    public LinkKey() {
+    }
+
+    public LinkKey(long chainId, long order) {
+      this.chainId = chainId;
+      this.order = order;
+    }
+
+    @Override
+    public int compareTo(LinkKey linkKey) {
+      int res = getChainId().compareTo(linkKey.getChainId());
+      if (res == 0) {
+        res = getOrder().compareTo(linkKey.getOrder());
+      }
+      return res;
+    }
+
+    @Override
+    public void write(DataOutput dataOutput) throws IOException {
+      WritableUtils.writeVLong(dataOutput, chainId);
+      WritableUtils.writeVLong(dataOutput, order);
+    }
+
+    @Override
+    public void readFields(DataInput dataInput) throws IOException {
+      chainId = WritableUtils.readVLong(dataInput);
+      order = WritableUtils.readVLong(dataInput);
+    }
+  }
+
+  /**
+   * Writable used as the value emitted from a pass over the hbase table.
+   */
+  public static class LinkChain implements WritableComparable<LinkChain> {
+
+    private Long rk;
+    private Long next;
+
+    public Long getNext() {
+      return next;
+    }
+
+    public Long getRk() {
+      return rk;
+    }
+
+    public LinkChain() {
+    }
+
+    public LinkChain(Long rk, Long next) {
+      this.rk = rk;
+      this.next = next;
+    }
+
+    @Override
+    public int compareTo(LinkChain linkChain) {
+      int res = getRk().compareTo(linkChain.getRk());
+      if (res == 0) {
+        res = getNext().compareTo(linkChain.getNext());
+      }
+      return res;
+    }
+
+    @Override
+    public void write(DataOutput dataOutput) throws IOException {
+      WritableUtils.writeVLong(dataOutput, rk);
+      WritableUtils.writeVLong(dataOutput, next);
+    }
+
+    @Override
+    public void readFields(DataInput dataInput) throws IOException {
+      rk = WritableUtils.readVLong(dataInput);
+      next = WritableUtils.readVLong(dataInput);
+    }
+  }
+
+  /**
+   * Class to figure out what partition to send a link in the chain to. This is based upon
+   * the linkKey's ChainId.
+   */
+  public static class NaturalKeyPartitioner extends Partitioner<LinkKey, LinkChain> {
+    @Override
+    public int getPartition(LinkKey linkKey,
+                            LinkChain linkChain,
+                            int numPartitions) {
+      // Mask off the sign bit so the partition is always non-negative, even when the
+      // chain id (a random long) hashes to a negative int.
+      int hash = linkKey.getChainId().hashCode();
+      return (hash & Integer.MAX_VALUE) % numPartitions;
+    }
+  }
+
+  /**
+   * Comparator used to figure out if a linkKey should be grouped together. This is based upon the
+   * linkKey's ChainId.
+   */
+  public static class NaturalKeyGroupingComparator extends WritableComparator {
+
+    protected NaturalKeyGroupingComparator() {
+      super(LinkKey.class, true);
+    }
+
+    @Override
+    public int compare(WritableComparable w1, WritableComparable w2) {
+      LinkKey k1 = (LinkKey) w1;
+      LinkKey k2 = (LinkKey) w2;
+
+      return k1.getChainId().compareTo(k2.getChainId());
+    }
+  }
+
+  /**
+   * Comparator used to order linkKeys so that they are passed to a reducer in order. This is based
+   * upon linkKey ChainId and Order.
+   */
+  public static class CompositeKeyComparator extends WritableComparator {
+
+    protected CompositeKeyComparator() {
+      super(LinkKey.class, true);
+    }
+
+    @Override
+    public int compare(WritableComparable w1, WritableComparable w2) {
+      LinkKey k1 = (LinkKey) w1;
+      LinkKey k2 = (LinkKey) w2;
+
+      return k1.compareTo(k2);
+    }
+  }
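The three classes above are the standard MapReduce secondary-sort trio: partition and group on the chain id, sort on (chain id, order), so each reduce call sees exactly one chain with its links already in order. Below is a minimal sketch of that contract, not part of the patch; the class name SecondarySortSketch is invented and it assumes the test class above is on the classpath. Since CompositeKeyComparator simply delegates to LinkKey.compareTo, sorting by the natural order reproduces the shuffle ordering, and a new reduce group starts whenever the chain id changes.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.hbase.mapreduce.IntegrationTestBulkLoad;

public class SecondarySortSketch {
  public static void main(String[] args) {
    List<IntegrationTestBulkLoad.LinkKey> keys =
        new ArrayList<IntegrationTestBulkLoad.LinkKey>();
    keys.add(new IntegrationTestBulkLoad.LinkKey(2L, 1L));
    keys.add(new IntegrationTestBulkLoad.LinkKey(1L, 2L));
    keys.add(new IntegrationTestBulkLoad.LinkKey(1L, 0L));
    keys.add(new IntegrationTestBulkLoad.LinkKey(2L, 0L));
    keys.add(new IntegrationTestBulkLoad.LinkKey(1L, 1L));

    // LinkKey's natural order is (chainId, order), the same order
    // CompositeKeyComparator gives the shuffle.
    Collections.sort(keys);

    Long lastChain = null;
    for (IntegrationTestBulkLoad.LinkKey k : keys) {
      if (!k.getChainId().equals(lastChain)) {
        // This boundary is what NaturalKeyGroupingComparator enforces: one reduce
        // group per chain id.
        System.out.println("-- new reduce group for chain " + k.getChainId());
        lastChain = k.getChainId();
      }
      System.out.println("chain=" + k.getChainId() + " order=" + k.getOrder());
    }
  }
}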
+
+  /**
+   * Mapper to pass over the table.
+   *
+   * For every row there could be multiple chains that landed on this row. So emit a linkKey
+   * and value for each.
+   */
+  public static class LinkedListCheckingMapper extends TableMapper<LinkKey, LinkChain> {
+    protected void map(ImmutableBytesWritable key, Result value, Context context)
+        throws IOException, InterruptedException {
+      long longRk = Bytes.toLong(value.getRow());
+
+      for (Map.Entry<byte[], byte[]> entry : value.getFamilyMap(CHAIN_FAM).entrySet()) {
+        long chainId = Bytes.toLong(entry.getKey());
+        long next = Bytes.toLong(entry.getValue());
+        long order = Bytes.toLong(value.getColumn(SORT_FAM, entry.getKey()).get(0).getValue());
+        context.write(new LinkKey(chainId, order), new LinkChain(longRk, next));
+      }
+    }
+  }
+
+  /**
+   * Class that does the actual checking of the links.
+   *
+   * All links in the chain should be grouped and sorted when sent to this class. Then the chain
+   * will be traversed making sure that no link is missing and that the chain is the correct
+   * length.
+   *
+   * This will throw an exception if anything is not correct. That causes the job to fail if any
+   * data is corrupt.
+   */
+  public static class LinkedListCheckingReducer
+      extends Reducer<LinkKey, LinkChain, NullWritable, NullWritable> {
+    protected void reduce(LinkKey key, Iterable<LinkChain> values, Context context)
+        throws java.io.IOException, java.lang.InterruptedException {
+      long next = -1L;
+      long count = 0L;
+
+      for (LinkChain lc : values) {
+        if (next == -1) {
+          if (lc.getRk() != 0L) throw new RuntimeException("Chains should all start at rk 0");
+          next = lc.getNext();
+        } else {
+          if (next != lc.getRk())
+            throw new RuntimeException("Missing a link in the chain. Expecting "
+                + next + " got " + lc.getRk());
+          next = lc.getNext();
+        }
+        count++;
+      }
+
+      int expectedChainLen = context.getConfiguration().getInt(CHAIN_LENGTH_KEY, CHAIN_LENGTH);
+      if (count != expectedChainLen)
+        throw new RuntimeException("Chain wasn't the correct length. Expected "
+            + expectedChainLen + " got " + count);
+    }
+  }
+
+  /**
+   * After adding data to the table, start an MR job to check that every chain is complete and
+   * has the correct length.
+   * @throws IOException
+   * @throws ClassNotFoundException
+   * @throws InterruptedException
+   */
+  private void runCheck() throws IOException, ClassNotFoundException, InterruptedException {
+    Configuration conf = getConf();
+    String jobName = tableName + "_check" + EnvironmentEdgeManager.currentTimeMillis();
+    Path p = util.getDataTestDirOnTestFS(jobName);
+
+    Job job = new Job(conf);
+
+    job.setJarByClass(getClass());
+
+    job.setPartitionerClass(NaturalKeyPartitioner.class);
+    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
+    job.setSortComparatorClass(CompositeKeyComparator.class);
+
+    Scan s = new Scan();
+    s.addFamily(CHAIN_FAM);
+    s.addFamily(SORT_FAM);
+    s.setMaxVersions(1);
+    s.setCacheBlocks(false);
+    s.setBatch(100);
+
+    // Use the configured scan so that only the chain and sort families are read.
+    TableMapReduceUtil.initTableMapperJob(
+        Bytes.toBytes(tableName),
+        s,
+        LinkedListCheckingMapper.class,
+        LinkKey.class,
+        LinkChain.class,
+        job
+    );
+
+    job.setReducerClass(LinkedListCheckingReducer.class);
+    job.setOutputKeyClass(NullWritable.class);
+    job.setOutputValueClass(NullWritable.class);
+
+    FileOutputFormat.setOutputPath(job, p);
+
+    assertEquals(true, job.waitForCompletion(true));
+
+    // Delete the files.
+ util.getTestFileSystem().delete(p, true); + } + + @BeforeClass + public static void provisionCluster() throws Exception { + if (null == util) { + util = new IntegrationTestingUtility(); + } + + util.initializeCluster(1); + + // Scale this up on a real cluster + if (util.isDistributedCluster()) { + util.getConfiguration().setIfUnset(NUM_MAPS_KEY, + Integer.toString(util.getHBaseAdmin().getClusterStatus().getServersSize() * 20) + ); + util.getConfiguration().setIfUnset(NUM_IMPORT_ROUNDS_KEY, "3"); + } else { + util.startMiniMapReduceCluster(); + } + } + + @AfterClass + public static void releaseCluster() throws Exception { + util.restoreCluster(); + util = null; + } + + @Override + public int run(String[] args) throws Exception { + provisionCluster(); + testBulkLoad(); + releaseCluster(); + return 0; + } + + public void setConf(Configuration conf) { + if (util != null) { + throw new IllegalArgumentException("setConf not supported after the cluster has been started."); + } + util = new IntegrationTestingUtility(conf); + } + + @Override + public Configuration getConf() { + return util.getConfiguration(); + } + + public static void main(String[] args) throws Exception { + Configuration conf = HBaseConfiguration.create(); + IntegrationTestingUtility.setUseDistributedCluster(conf); + int status = ToolRunner.run(conf, new IntegrationTestBulkLoad(), args); + System.exit(status); + } + +} \ No newline at end of file diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java index f86ea26..4d69e60 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java @@ -336,6 +336,9 @@ public class HFileOutputFormat extends FileOutputFormat startKeys = getRegionStartKeys(table); @@ -349,6 +352,7 @@ public class HFileOutputFormat extends FileOutputFormat
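For completeness, here is a sketch of driving the test programmatically with the configuration keys defined above, mirroring the test's own main(). It is illustrative only, not part of the patch; the class name RunBulkLoadIT and the chosen values are invented, and it assumes the hbase-it test classes plus an hbase-site.xml for a cluster set up for integration tests are on the classpath.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.IntegrationTestingUtility;
import org.apache.hadoop.hbase.mapreduce.IntegrationTestBulkLoad;
import org.apache.hadoop.util.ToolRunner;

public class RunBulkLoadIT {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();

    // Shorter chains and a few more mappers than the defaults; these key names are
    // the ones defined at the top of IntegrationTestBulkLoad. Values here are arbitrary.
    conf.setInt("hbase.IntegrationTestBulkLoad.chainLength", 100000);
    conf.setInt("hbase.IntegrationTestBulkLoad.numMaps", 4);
    conf.setInt("hbase.IntegrationTestBulkLoad.numImportRounds", 2);

    // Run against an already-running cluster, as the test's main() does.
    IntegrationTestingUtility.setUseDistributedCluster(conf);
    int status = ToolRunner.run(conf, new IntegrationTestBulkLoad(), args);
    System.exit(status);
  }
}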