From 650e9d8abfe491518276270104807eac427e80e3 Mon Sep 17 00:00:00 2001
From: xiaowen147
Date: Thu, 19 Oct 2017 23:55:32 +0800
Subject: [PATCH] Add new split algorithm for num string

Add DecStringSplit, a SplitAlgorithm for row keys that are zero-padded
decimal strings ("00000000" => "99999999" by default).

The logic that used to live in HexStringSplit moves into a new abstract
base class, NumStringSplit, which is parameterized by radix.
HexStringSplit (radix 16) and DecStringSplit (radix 10) now only supply
their default first/last rows and radix to the base-class constructor.

DecStringSplit can be selected by its simple class name, in the same way
as HexStringSplit and UniformSplit.

Note for callers: convertToByte(BigInteger, int) is no longer static,
because the string conversion now depends on the instance's radix.

A unit test for DecStringSplit is added to TestRegionSplitter.

---
 .../apache/hadoop/hbase/util/RegionSplitter.java   | 63 +++++++++++++++++-----
 .../hadoop/hbase/util/TestRegionSplitter.java      | 49 +++++++++++++++++
 2 files changed, 100 insertions(+), 12 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java
index 3ee593a..92195ca 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java
@@ -660,6 +660,8 @@ public class RegionSplitter {
     // their simple class name instead of a fully qualified class name.
     if(splitClassName.equals(HexStringSplit.class.getSimpleName())) {
       splitClass = HexStringSplit.class;
+    } else if(splitClassName.equals(DecStringSplit.class.getSimpleName())) {
+      splitClass = DecStringSplit.class;
     } else if (splitClassName.equals(UniformSplit.class.getSimpleName())) {
       splitClass = UniformSplit.class;
     } else {
@@ -893,15 +895,52 @@ public class RegionSplitter {
    * Since this split algorithm uses hex strings as keys, it is easy to read &
    * write in the shell but takes up more space and may be non-intuitive.
    */
-  public static class HexStringSplit implements SplitAlgorithm {
+  public static class HexStringSplit extends NumStringSplit {
     final static String DEFAULT_MIN_HEX = "00000000";
     final static String DEFAULT_MAX_HEX = "FFFFFFFF";
+    final static int RADIX_HEX = 16;
 
-    String firstRow = DEFAULT_MIN_HEX;
-    BigInteger firstRowInt = BigInteger.ZERO;
-    String lastRow = DEFAULT_MAX_HEX;
-    BigInteger lastRowInt = new BigInteger(lastRow, 16);
-    int rowComparisonLength = lastRow.length();
+    public HexStringSplit() {
+      super(DEFAULT_MIN_HEX, DEFAULT_MAX_HEX, RADIX_HEX);
+    }
+
+  }
+
+  /**
+   * The format of a DecStringSplit region boundary is the ASCII representation of
+   * reversed sequential number, or any other uniformly distributed decimal value.
+   * Rows are decimal-encoded long values in the range
+   * "00000000" => "99999999" and are left-padded with zeros to keep the
+   * same order lexicographically as if they were binary.
+   */
+  public static class DecStringSplit extends NumStringSplit {
+    final static String DEFAULT_MIN_DEC = "00000000";
+    final static String DEFAULT_MAX_DEC = "99999999";
+    final static int RADIX_DEC = 10;
+
+    public DecStringSplit() {
+      super(DEFAULT_MIN_DEC, DEFAULT_MAX_DEC, RADIX_DEC);
+    }
+
+  }
+
+  public abstract static class NumStringSplit implements SplitAlgorithm {
+
+    String firstRow;
+    BigInteger firstRowInt;
+    String lastRow;
+    BigInteger lastRowInt;
+    int rowComparisonLength;
+    int radix;
+
+    NumStringSplit(String minRow, String maxRow, int radix) {
+      this.firstRow = minRow;
+      this.lastRow = maxRow;
+      this.radix = radix;
+      this.firstRowInt = new BigInteger(firstRow, this.radix);
+      this.lastRowInt = new BigInteger(lastRow, this.radix);
+      this.rowComparisonLength = lastRow.length();
+    }
 
     public byte[] split(byte[] start, byte[] end) {
       BigInteger s = convertToBigInteger(start);
@@ -973,18 +1012,18 @@ public class RegionSplitter {
 
     public void setFirstRow(String userInput) {
       firstRow = userInput;
-      firstRowInt = new BigInteger(firstRow, 16);
+      firstRowInt = new BigInteger(firstRow, radix);
     }
 
     public void setLastRow(String userInput) {
       lastRow = userInput;
-      lastRowInt = new BigInteger(lastRow, 16);
+      lastRowInt = new BigInteger(lastRow, radix);
       // Precondition: lastRow > firstRow, so last's length is the greater
       rowComparisonLength = lastRow.length();
     }
 
     public byte[] strToRow(String in) {
-      return convertToByte(new BigInteger(in, 16));
+      return convertToByte(new BigInteger(in, radix));
     }
 
     public String rowToStr(byte[] row) {
@@ -1037,8 +1076,8 @@ public class RegionSplitter {
      * @param pad padding length
      * @return byte corresponding to input BigInteger
      */
-    public static byte[] convertToByte(BigInteger bigInteger, int pad) {
-      String bigIntegerString = bigInteger.toString(16);
+    public byte[] convertToByte(BigInteger bigInteger, int pad) {
+      String bigIntegerString = bigInteger.toString(radix);
       bigIntegerString = StringUtils.leftPad(bigIntegerString, pad, '0');
       return Bytes.toBytes(bigIntegerString);
     }
@@ -1060,7 +1099,7 @@ public class RegionSplitter {
      * @return the corresponding BigInteger
      */
     public BigInteger convertToBigInteger(byte[] row) {
-      return (row.length > 0) ? new BigInteger(Bytes.toString(row), 16)
+      return (row.length > 0) ? new BigInteger(Bytes.toString(row), radix)
           : BigInteger.ZERO;
     }
 
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestRegionSplitter.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestRegionSplitter.java
index aa42616..e4dc438 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestRegionSplitter.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestRegionSplitter.java
@@ -40,6 +40,7 @@ import org.apache.hadoop.hbase.client.Table;
 import org.apache.hadoop.hbase.testclassification.MediumTests;
 import org.apache.hadoop.hbase.testclassification.MiscTests;
 import org.apache.hadoop.hbase.util.RegionSplitter.HexStringSplit;
+import org.apache.hadoop.hbase.util.RegionSplitter.DecStringSplit;
 import org.apache.hadoop.hbase.util.RegionSplitter.SplitAlgorithm;
 import org.apache.hadoop.hbase.util.RegionSplitter.UniformSplit;
 import org.junit.AfterClass;
@@ -181,6 +182,54 @@ public class TestRegionSplitter {
   }
 
   /**
+   * Unit tests for the DecStringSplit algorithm. Makes sure it divides up the
+   * space of keys in the way that we expect.
+   */
+  @Test
+  public void unitTestDecStringSplit() {
+    DecStringSplit splitter = new DecStringSplit();
+    // Check splitting while starting from scratch
+
+    byte[][] twoRegionsSplits = splitter.split(2);
+    assertEquals(1, twoRegionsSplits.length);
+    assertArrayEquals("50000000".getBytes(), twoRegionsSplits[0]);
+
+    byte[][] threeRegionsSplits = splitter.split(3);
+    assertEquals(2, threeRegionsSplits.length);
+    byte[] expectedSplit0 = "33333333".getBytes();
+    assertArrayEquals(expectedSplit0, threeRegionsSplits[0]);
+    byte[] expectedSplit1 = "66666666".getBytes();
+    assertArrayEquals(expectedSplit1, threeRegionsSplits[1]);
+
+    // Check splitting existing regions that have start and end points
+    byte[] splitPoint = splitter.split("10000000".getBytes(), "30000000".getBytes());
+    assertArrayEquals("20000000".getBytes(), splitPoint);
+
+    byte[] lastRow = "99999999".getBytes();
+    assertArrayEquals(lastRow, splitter.lastRow());
+    byte[] firstRow = "00000000".getBytes();
+    assertArrayEquals(firstRow, splitter.firstRow());
+
+    // Halfway between 00... and 20... should be 10...
+    splitPoint = splitter.split(firstRow, "20000000".getBytes());
+    assertArrayEquals("10000000".getBytes(), splitPoint);
+
+    // Halfway between 79... and 99... should be 89....
+    splitPoint = splitter.split("79999999".getBytes(), lastRow);
+    assertArrayEquals("89999999".getBytes(), splitPoint);
+
+    // Check splitting region with multiple mappers per region
+    byte[][] splits = splitter.split("00000000".getBytes(), "30000000".getBytes(), 3, false);
+    assertEquals(2, splits.length);
+    assertArrayEquals("10000000".getBytes(), splits[0]);
+    assertArrayEquals("20000000".getBytes(), splits[1]);
+
+    splits = splitter.split("00000000".getBytes(), "20000000".getBytes(), 2, true);
+    assertEquals(3, splits.length);
+    assertArrayEquals("10000000".getBytes(), splits[1]);
+  }
+
+  /**
    * Unit tests for the UniformSplit algorithm. Makes sure it divides up the space of
    * keys in the way that we expect.
    */
-- 
2.9.3.windows.2