From 97a33f0258760a8751e2f30d794ab7eb0ee0f0a6 Mon Sep 17 00:00:00 2001
From: Elliott Clark
Date: Thu, 30 May 2013 22:33:52 -0700
Subject: [PATCH] IntegrationTestBulkLoad

---
 .../hbase/mapreduce/IntegrationTestBulkLoad.java   | 506 ++++++++++++++++++++
 .../hadoop/hbase/mapreduce/HFileOutputFormat.java  |   3 +
 .../apache/hadoop/hbase/util/RegionSplitter.java   |  25 +
 .../apache/hadoop/hbase/HBaseTestingUtility.java   |  19 +
 4 files changed, 553 insertions(+)
 create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/mapreduce/IntegrationTestBulkLoad.java

diff --git hbase-it/src/test/java/org/apache/hadoop/hbase/mapreduce/IntegrationTestBulkLoad.java hbase-it/src/test/java/org/apache/hadoop/hbase/mapreduce/IntegrationTestBulkLoad.java
new file mode 100644
index 0000000..c3376fd
--- /dev/null
+++ hbase-it/src/test/java/org/apache/hadoop/hbase/mapreduce/IntegrationTestBulkLoad.java
@@ -0,0 +1,506 @@
+package org.apache.hadoop.hbase.mapreduce;
+
+import org.apache.commons.lang.RandomStringUtils;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.IntegrationTestingUtility;
+import org.apache.hadoop.hbase.IntegrationTests;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.RegionSplitter;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Partitioner;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.UUID;
+
+import static org.junit.Assert.assertEquals;
+
+@Category(IntegrationTests.class)
+public class IntegrationTestBulkLoad implements Configurable, Tool {
+
+  private static byte[] CHAIN_FAM = Bytes.toBytes("l");
+  private static byte[] SORT_FAM = Bytes.toBytes("s");
+  private static byte[] DATA_FAM = Bytes.toBytes("d");
+
+  private static String CHAIN_LENGTH_KEY = "hbase.IntegrationTestBulkLoad.chainLength";
+  private static int CHAIN_LENGTH = 200000;
+
+  private static String NUM_MAPS_KEY = "hbase.IntegrationTestBulkLoad.numMaps";
+  private static int NUM_MAPS = 1;
+
+  private static String NUM_IMPORT_ROUNDS_KEY = "hbase.IntegrationTestBulkLoad.numImportRounds";
+  private static int NUM_IMPORT_ROUNDS = 1;
+
+
+  private static String TABLE_NAME_KEY = "hbase.IntegrationTestBulkLoad.tableName";
+  private static String TABLE_NAME = "IntegrationTestBulkLoad";
+
+  private static IntegrationTestingUtility util;
+
+  private String tableName;
+
+  @Test
+  public void testBulkLoad() throws Exception {
+    setupTable();
+    int numImportRounds = getConf().getInt(NUM_IMPORT_ROUNDS_KEY, NUM_IMPORT_ROUNDS);
+    for (int i = 0; i < numImportRounds; i++) {
+      runLinkedListMRJob(i);
+    }
+    runCheck();
+  }
+
+  private byte[][] getSplits(int numRegions) {
+    RegionSplitter.UniformSplit split = new RegionSplitter.UniformSplit();
+    split.setFirstRow(Bytes.toBytes(0l));
+    split.setLastRow(Bytes.toBytes(Long.MAX_VALUE));
+    return split.split(numRegions);
+  }
+
+  private void setupTable() throws IOException {
+    tableName = getConf().get(TABLE_NAME_KEY, TABLE_NAME);
+
+    util.createTable(
+        Bytes.toBytes(tableName),
+        new byte[][]{CHAIN_FAM, SORT_FAM, DATA_FAM},
+        getSplits(16)
+    );
+  }
+
+  private void runLinkedListMRJob(int iteration) throws Exception {
+    UUID uuid = UUID.randomUUID();
+    String jobName = IntegrationTestBulkLoad.class.getSimpleName() + " - " + uuid.toString();
+    Configuration conf = new Configuration(util.getConfiguration());
+    Path p = util.getDataTestDirOnTestFS(tableName + "-" + iteration);
+    HTable table = new HTable(conf, tableName);
+
+    conf.setBoolean("mapreduce.map.speculative", false);
+    conf.setBoolean("mapreduce.reduce.speculative", false);
+
+    Job job = new Job(conf);
+
+    job.setJobName(jobName);
+
+    job.setInputFormatClass(RandomInputFormat.class);
+
+    job.setMapperClass(LinkedListCreationMapper.class);
+    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
+    job.setMapOutputValueClass(KeyValue.class);
+
+    // Use the identity reducer
+    // So nothing to do here.
+
+    TableMapReduceUtil.addDependencyJars(job.getConfiguration(), LinkedListCreationMapper.class);
+    job.setJarByClass(getClass());
+
+
+    FileOutputFormat.setOutputPath(job, p);
+
+    HFileOutputFormat.configureIncrementalLoad(job, table);
+
+    assertEquals(true, job.waitForCompletion(true));
+
+    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
+
+    loader.doBulkLoad(p, table);
+
+    util.getTestFileSystem().delete(p, true);
+  }
+
+  static class RandomInputFormat extends InputFormat<Text, LongWritable> {
+
+    /**
+     * Generate the requested number of file splits, with the filename
+     * set to the filename of the output file.
+     */
+    public List<InputSplit> getSplits(JobContext job) throws IOException {
+      List<InputSplit> result = new ArrayList<InputSplit>();
+      int numSplits = job.getConfiguration().getInt(NUM_MAPS_KEY, NUM_MAPS);
+      for (int i = 0; i < numSplits; ++i) {
+        result.add(new FileSplit(new Path("/tmp", "dummy-split-" + i), 0, 1, null));
+      }
+      return result;
+    }
+
+    static class RandomRecordReader extends RecordReader<Text, LongWritable> {
+      Path name;
+      Text key = null;
+      LongWritable value = new LongWritable();
+
+      public RandomRecordReader(Path p) {
+        name = p;
+      }
+
+      public void initialize(InputSplit split,
+                             TaskAttemptContext context)
+          throws IOException, InterruptedException {
+
+      }
+
+      public boolean nextKeyValue() {
+        if (name != null) {
+          key = new Text();
+          key.set(name.getName());
+          name = null;
+          value.set(new Random().nextLong());
+          return true;
+        }
+        return false;
+      }
+
+      public Text getCurrentKey() {
+        return key;
+      }
+
+      public LongWritable getCurrentValue() {
+        return value;
+      }
+
+      public void close() {
+      }
+
+      public float getProgress() {
+        return 0.0f;
+      }
+    }
+
+    public RecordReader<Text, LongWritable> createRecordReader(InputSplit split,
+        TaskAttemptContext context)
+        throws IOException, InterruptedException {
+      return new RandomRecordReader(((FileSplit) split).getPath());
+    }
+  }
+
+  public static class LinkedListCreationMapper
+      extends Mapper<Text, LongWritable, ImmutableBytesWritable, KeyValue> {
+
+    Random rand = new Random();
+
+    protected void map(Text key, LongWritable value, Context context)
+        throws IOException, InterruptedException {
+      long chainId = value.get();
+      byte[] chainIdArray = Bytes.toBytes(chainId);
+      long currentRow = 0;
+      long nextRow = Math.abs(rand.nextLong());
+
+      int chainLength = context.getConfiguration().getInt(CHAIN_LENGTH_KEY, CHAIN_LENGTH);
+
+      for (long i = 0; i < chainLength; i++) {
+        byte[] rk = Bytes.toBytes(currentRow);
+        KeyValue linkKv = new KeyValue(rk, CHAIN_FAM, chainIdArray, Bytes.toBytes(nextRow));
+        KeyValue sortKv = new KeyValue(rk, SORT_FAM, chainIdArray, Bytes.toBytes(i));
+        KeyValue dataKv = new KeyValue(rk, DATA_FAM, chainIdArray,
+            Bytes.toBytes(RandomStringUtils.randomAlphabetic(500)));
+
+        context.write(new ImmutableBytesWritable(rk), linkKv);
+        context.write(new ImmutableBytesWritable(rk), sortKv);
+        context.write(new ImmutableBytesWritable(rk), dataKv);
+
+        currentRow = nextRow;
+        nextRow = Math.abs(rand.nextLong());
+      }
+    }
+  }
+
+  public static class LinkKey implements WritableComparable<LinkKey> {
+
+    private Long chainId;
+
+    public Long getOrder() {
+      return order;
+    }
+
+    public Long getChainId() {
+      return chainId;
+    }
+
+    private Long order;
+
+    public LinkKey() {
+
+    }
+
+    public LinkKey(long chainId, long order) {
+      this.chainId = chainId;
+      this.order = order;
+    }
+
+    @Override
+    public int compareTo(LinkKey linkKey) {
+      int res = getChainId().compareTo(linkKey.getChainId());
+      if (res == 0) {
+        res = getOrder().compareTo(linkKey.getOrder());
+      }
+      return res;
+    }
+
+    @Override
+    public void write(DataOutput dataOutput) throws IOException {
+      WritableUtils.writeVLong(dataOutput, chainId);
+      WritableUtils.writeVLong(dataOutput, order);
+    }
+
+    @Override
+    public void readFields(DataInput dataInput) throws IOException {
+      chainId = WritableUtils.readVLong(dataInput);
+      order = WritableUtils.readVLong(dataInput);
+    }
+  }
+
+  public static class LinkChain implements WritableComparable<LinkChain> {
+
+    public Long getNext() {
+      return next;
+    }
+
+    public Long getRk() {
+      return rk;
+    }
+
+    public LinkChain() {
+    }
+
+    public LinkChain(Long rk, Long next) {
+      this.rk = rk;
+      this.next = next;
+    }
+
+    private Long rk;
+    private Long next;
+
+    @Override
+    public int compareTo(LinkChain linkChain) {
+      int res = getRk().compareTo(linkChain.getRk());
+      if (res == 0) {
+        res = getNext().compareTo(linkChain.getNext());
+      }
+      return res;
+    }
+
+    @Override
+    public void write(DataOutput dataOutput) throws IOException {
+      WritableUtils.writeVLong(dataOutput, rk);
+      WritableUtils.writeVLong(dataOutput, next);
+    }
+
+    @Override
+    public void readFields(DataInput dataInput) throws IOException {
+      rk = WritableUtils.readVLong(dataInput);
+      next = WritableUtils.readVLong(dataInput);
+    }
+  }
+
+  public static class NaturalKeyPartitioner extends Partitioner<LinkKey, LinkChain> {
+
+    @Override
+    public int getPartition(LinkKey linkKey,
+                            LinkChain linkChain,
+                            int numPartitions) {
+      int hash = linkKey.getChainId().hashCode();
+      int partition = Math.abs(hash % numPartitions);
+      return partition;
+    }
+  }
+
+  public static class NaturalKeyGroupingComparator extends WritableComparator {
+
+    protected NaturalKeyGroupingComparator() {
+      super(LinkKey.class, true);
+    }
+
+    public int compare(WritableComparable w1, WritableComparable w2) {
+      LinkKey k1 = (LinkKey) w1;
+      LinkKey k2 = (LinkKey) w2;
+
+      return k1.getChainId().compareTo(k2.getChainId());
+    }
+  }
+
+  public static class CompositeKeyComparator extends WritableComparator {
+
+    protected CompositeKeyComparator() {
+      super(LinkKey.class, true);
+    }
+
+    @Override
+    public int compare(WritableComparable w1, WritableComparable w2) {
+      LinkKey k1 = (LinkKey) w1;
+      LinkKey k2 = (LinkKey) w2;
+
+      return k1.compareTo(k2);
+    }
+  }
+
+  public static class LinkedListCheckingMapper extends TableMapper<LinkKey, LinkChain> {
+    protected void map(ImmutableBytesWritable key, Result value, Context context)
+        throws IOException, InterruptedException {
+      long longRk = Bytes.toLong(value.getRow());
+
+      for (Map.Entry<byte[], byte[]> entry : value.getFamilyMap(CHAIN_FAM).entrySet()) {
+        long chainId = Bytes.toLong(entry.getKey());
+        long next = Bytes.toLong(entry.getValue());
+        long order = Bytes.toLong(value.getColumn(SORT_FAM, entry.getKey()).get(0).getValue());
+        context.write(new LinkKey(chainId, order), new LinkChain(longRk, next));
+      }
+    }
+  }
+
+  public static class LinkedListCheckingReducer
+      extends Reducer<LinkKey, LinkChain, NullWritable, NullWritable> {
+    protected void reduce(LinkKey key, Iterable<LinkChain> values, Context context)
+        throws java.io.IOException, java.lang.InterruptedException {
+      long next = -1l;
+      long count = 0l;
+
+      for (LinkChain lc : values) {
+
+        if (next == -1) {
+          if (lc.getRk() != 0l) throw new RuntimeException("Chains should all start at 0 rk");
+          next = lc.getNext();
+        } else {
+          if (next != lc.getRk())
+            throw new RuntimeException("Missing a link in the chain. Expecting " +
+                next +
+                " got " +
+                lc.getRk());
+          next = lc.getNext();
+        }
+        count++;
+      }
+
+      int expectedChainLen = context.getConfiguration().getInt(CHAIN_LENGTH_KEY, CHAIN_LENGTH);
+      if (count != expectedChainLen)
+        throw new RuntimeException("Chain wasn't the correct length. Expected " +
+            expectedChainLen +
+            " got " +
+            count);
+    }
+  }
+
+  private void runCheck() throws IOException, ClassNotFoundException, InterruptedException {
+    Configuration conf = getConf();
+
+    Path p = util.getDataTestDirOnTestFS(tableName + "_check");
+
+
+    Job job = new Job(conf);
+
+    job.setJarByClass(getClass());
+
+    job.setPartitionerClass(NaturalKeyPartitioner.class);
+    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
+    job.setSortComparatorClass(CompositeKeyComparator.class);
+
+    Scan s = new Scan();
+    s.addFamily(CHAIN_FAM);
+    s.addFamily(SORT_FAM);
+    s.setMaxVersions(1);
+    s.setCacheBlocks(false);
+    s.setBatch(100);
+
+    TableMapReduceUtil.initTableMapperJob(
+        Bytes.toBytes(tableName),
+        s,
+        LinkedListCheckingMapper.class,
+        LinkKey.class,
+        LinkChain.class,
+        job
+    );
+
+    job.setReducerClass(LinkedListCheckingReducer.class);
+    job.setOutputKeyClass(NullWritable.class);
+    job.setOutputValueClass(NullWritable.class);
+
+    FileOutputFormat.setOutputPath(job, p);
+
+    assertEquals(true, job.waitForCompletion(true));
+  }
+
+  @BeforeClass
+  public static void provisionCluster() throws Exception {
+    if (null == util) {
+      util = new IntegrationTestingUtility();
+    }
+    util.initializeCluster(1);
+
+    if (util.isDistributedCluster()) {
+      util.getConfiguration().setIfUnset(NUM_MAPS_KEY, "100");
+      util.getConfiguration().setIfUnset(NUM_IMPORT_ROUNDS_KEY, "3");
+    } else {
+      util.startMiniMapReduceCluster();
+    }
+  }
+
+  @AfterClass
+  public static void releaseCluster() throws Exception {
+    util.restoreCluster();
+    util = null;
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    provisionCluster();
+    testBulkLoad();
+    releaseCluster();
+    return 0;
+  }
+
+
+  public void setConf(Configuration conf) {
+    if (util != null) {
+      throw new IllegalArgumentException("setConf not supported after the cluster has been started.");
+    }
+    util = new IntegrationTestingUtility(conf);
+  }
+
+  @Override
+  public Configuration getConf() {
+    return util.getConfiguration();
+  }
+
+  public static void main(String[] args) throws Exception {
+    Configuration conf = HBaseConfiguration.create();
+    IntegrationTestingUtility.setUseDistributedCluster(conf);
+    int status = ToolRunner.run(conf, new IntegrationTestBulkLoad(), args);
+    System.exit(status);
+  }
+
+}
\ No newline at end of file
diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java
index f86ea26..45c76fe 100644
--- hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java
@@ -336,6 +336,9 @@ public class HFileOutputFormat extends FileOutputFormat
     startKeys = getRegionStartKeys(table);
diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java
index 84b5f6b..1c632b9 100644
--- hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java
@@ -230,6 +230,10 @@ public class RegionSplitter {
    * @return the separator character to use when storing / printing the row
    */
   String separator();
+
+  void setFirstRow(byte[] userInput);
+
+  void setLastRow(byte[] userInput);
 }

 /**
@@ -872,6 +876,16 @@ public class RegionSplitter {
       return " ";
     }

+    @Override
+    public void setFirstRow(byte[] userInput) {
+      firstRow = Bytes.toString(userInput);
+    }
+
+    @Override
+    public void setLastRow(byte[] userInput) {
+      lastRow = Bytes.toString(userInput);
+    }
+
     /**
      * Divide 2 numbers in half (for split algorithm)
      *
@@ -992,6 +1006,17 @@
       lastRowBytes = Bytes.toBytesBinary(userInput);
     }

+
+    @Override
+    public void setFirstRow(byte[] userInput) {
+      firstRowBytes = userInput;
+    }
+
+    @Override
+    public void setLastRow(byte[] userInput) {
+      lastRowBytes = userInput;
+    }
+
     @Override
     public byte[] strToRow(String input) {
       return Bytes.toBytesBinary(input);
diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
index 8d2d883..438f143 100644
--- hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
@@ -1213,6 +1213,25 @@ public class HBaseTestingUtility extends HBaseCommonTestingUtility {
   }

   /**
+   * Create a table.
+   * @param tableName name of the table to create
+   * @param families column families to create in the table
+   * @param splitRows keys at which the table is pre-split into regions
+   * @return An HTable instance for the created table.
+   * @throws IOException
+   */
+  public HTable createTable(byte[] tableName, byte[][] families, byte[][] splitRows)
+      throws IOException {
+    HTableDescriptor desc = new HTableDescriptor(tableName);
+    for (byte[] family : families) {
+      HColumnDescriptor hcd = new HColumnDescriptor(family);
+      desc.addFamily(hcd);
+    }
+    getHBaseAdmin().createTable(desc, splitRows);
+    return new HTable(getConfiguration(), tableName);
+  }
+
+  /**
    * Drop an existing table
    * @param tableName existing table
    */
-- 
1.7.10.2 (Apple Git-33)
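
A note for reviewers on the two small API additions: the new SplitAlgorithm.setFirstRow(byte[])/setLastRow(byte[]) setters and the new HBaseTestingUtility.createTable(byte[], byte[][], byte[][]) overload are only exercised indirectly through setupTable()/getSplits() in the test. The following is a minimal sketch, not part of the patch, of how they fit together; it assumes the same imports as IntegrationTestBulkLoad, and the table name "exampleTable", the family "f", and the startMiniCluster() call are illustrative assumptions rather than code from this change.

  // Sketch only: mirrors IntegrationTestBulkLoad.setupTable()/getSplits().
  HBaseTestingUtility util = new HBaseTestingUtility();
  util.startMiniCluster();                            // assumes a local mini cluster is acceptable

  // Pre-split the long keyspace [0, Long.MAX_VALUE) into 16 regions.
  RegionSplitter.UniformSplit split = new RegionSplitter.UniformSplit();
  split.setFirstRow(Bytes.toBytes(0L));               // setter added by this patch
  split.setLastRow(Bytes.toBytes(Long.MAX_VALUE));    // setter added by this patch
  byte[][] splitKeys = split.split(16);

  // New createTable overload added by this patch: families plus explicit split keys.
  HTable table = util.createTable(
      Bytes.toBytes("exampleTable"),
      new byte[][] { Bytes.toBytes("f") },
      splitKeys);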
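
Because the test also implements Tool, it can be driven outside of JUnit. A rough sketch of launching it programmatically, following the test's own main() and using the configuration keys defined at the top of IntegrationTestBulkLoad; the numeric values below are arbitrary examples, not defaults from the patch.

  // Sketch only: follows IntegrationTestBulkLoad.main(); values are examples.
  Configuration conf = HBaseConfiguration.create();
  conf.setInt("hbase.IntegrationTestBulkLoad.chainLength", 100000);   // CHAIN_LENGTH_KEY
  conf.setInt("hbase.IntegrationTestBulkLoad.numMaps", 10);           // NUM_MAPS_KEY
  conf.setInt("hbase.IntegrationTestBulkLoad.numImportRounds", 2);    // NUM_IMPORT_ROUNDS_KEY
  IntegrationTestingUtility.setUseDistributedCluster(conf);
  int status = ToolRunner.run(conf, new IntegrationTestBulkLoad(), new String[0]);
  System.exit(status);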